From 4eeeaaf5e822718eb222e6c49abd82e1bcb566fd Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 5 Sep 2017 14:03:37 -0400 Subject: tcp: horizontal scaling improvments - do not scale syn-ack window - fix the max number of outstanding syns in builtin client - fix syn-sent ack validation to use modulo arithmetic - improve retransmit timer handler - fix output buffer allocator leakeage - improved debugging Change-Id: Iac3bc0eadf7d0b494a93e22d210a3153b61b3273 Signed-off-by: Florin Coras --- src/vnet/session/session.c | 21 ++-- src/vnet/session/session_node.c | 6 +- src/vnet/tcp/builtin_client.c | 5 + src/vnet/tcp/tcp.c | 26 ++++- src/vnet/tcp/tcp.h | 7 +- src/vnet/tcp/tcp_debug.h | 49 +++++---- src/vnet/tcp/tcp_error.def | 3 +- src/vnet/tcp/tcp_input.c | 204 ++++++++++++++++++++++++++----------- src/vnet/tcp/tcp_output.c | 212 ++++++++++++++++++--------------------- src/vppinfra/tw_timer_template.c | 11 +- 10 files changed, 335 insertions(+), 209 deletions(-) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 17644e292a9..4544f9a0f93 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -456,13 +456,16 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) st); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { - clib_warning ("This can't be good!"); + clib_warning ("half-open was removed!"); return -1; } + /* Cleanup half-open table */ + stream_session_half_open_table_del (tc); + /* Get the app's index from the handle we stored when opening connection * and the opaque (api_context for external apps) from transport session - * index*/ + * index */ app = application_get_if_valid (handle >> 32); if (!app) return -1; @@ -499,9 +502,6 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) new_s->session_state = SESSION_STATE_READY; } - /* Cleanup session lookup */ - stream_session_half_open_table_del (tc); - return error; } @@ -535,7 +535,7 @@ stream_session_disconnect_notify (transport_connection_t * tc) } /** - * Cleans up session and associated app if needed. + * Cleans up session and lookup table. */ void stream_session_delete (stream_session_t * s) @@ -559,9 +559,10 @@ stream_session_delete (stream_session_t * s) /** * Notification from transport that connection is being deleted * - * This should be called only on previously fully established sessions. For - * instance failed connects should call stream_session_connect_notify and - * indicate that the connect has failed. + * This removes the session if it is still valid. It should be called only on + * previously fully established sessions. For instance failed connects should + * call stream_session_connect_notify and indicate that the connect has + * failed. */ void stream_session_delete_notify (transport_connection_t * tc) @@ -748,7 +749,7 @@ session_send_session_evt_to_thread (u64 session_handle, if (PREDICT_TRUE (q->cursize < q->maxsize)) { if (unix_shared_memory_queue_add (q, (u8 *) & evt, - 1 /* do wait for mutex */ )) + 0 /* do wait for mutex */ )) { clib_warning ("failed to enqueue evt"); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index dec6d13ca84..09687687189 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -267,7 +267,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); b0->error = 0; - b0->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; @@ -321,8 +321,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, })); /* *INDENT-ON* */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (VLIB_BUFFER_TRACE_TRAJECTORY) + b0->pre_data[1] = 3; + if (PREDICT_FALSE (n_trace > 0)) { session_queue_trace_t *t0; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 94e6b4ae5d8..5b4c8679970 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -509,6 +509,11 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) /* Crude pacing for call setups */ if ((i % 4) == 0) vlib_process_suspend (vm, 10e-6); + ASSERT (i + 1 >= tm->ready_connections); + while (i + 1 - tm->ready_connections > 8000) + { + vlib_process_suspend (vm, 100e-6); + } } } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index a4c13084413..04f1e068b9d 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -160,6 +160,7 @@ tcp_half_open_connection_new (void) { tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc = 0; + ASSERT (vlib_get_thread_index () == 0); pool_get (tm->half_open_connections, tc); memset (tc, 0, sizeof (*tc)); tc->c_c_index = tc - tm->half_open_connections; @@ -561,6 +562,22 @@ tcp_connection_fib_attach (tcp_connection_t * tc) } #endif /* 0 */ +/** + * Initialize connection send variables. + */ +void +tcp_init_snd_vars (tcp_connection_t * tc) +{ + u32 time_now; + + /* Set random initial sequence */ + time_now = tcp_time_now (); + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_nxt = tc->iss + 1; + tc->snd_una_max = tc->snd_nxt; +} + /** Initialize tcp connection variables * * Should be called after having received a msg from the peer, i.e., a SYN or @@ -572,6 +589,9 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); + if (tc->state == TCP_STATE_SYN_RCVD) + tcp_init_snd_vars (tc); + // tcp_connection_fib_attach (tc); } @@ -691,6 +711,7 @@ tcp_connection_open (transport_endpoint_t * rmt) TCP_EVT_DBG (TCP_EVT_OPEN, tc); tc->state = TCP_STATE_SYN_SENT; + tcp_init_snd_vars (tc); tcp_send_syn (tc); clib_spinlock_unlock_if_init (&tm->half_open_lock); @@ -784,7 +805,7 @@ format_tcp_vars (u8 * s, va_list * args) tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", - tcp_flight_size (tc), tcp_available_snd_space (tc), + tcp_flight_size (tc), tcp_available_output_snd_space (tc), tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", @@ -1155,6 +1176,9 @@ tcp_timer_establish_handler (u32 conn_index) return; ASSERT (tc->state == TCP_STATE_SYN_RCVD); + /* Start cleanup. App wasn't notified yet so use delete notify as + * opposed to delete to cleanup session layer state. */ + stream_session_delete_notify (&tc->connection); } tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; tcp_connection_cleanup (tc); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 11d61f5dc83..6020a3debbe 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -97,7 +97,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; * ticks to timer units */ #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ -#define TCP_SYN_RCVD_TIME 100 /* 10s */ +#define TCP_SYN_RCVD_TIME 600 /* 60s */ #define TCP_2MSL_TIME 300 /* 30s */ #define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ @@ -676,6 +676,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); +void tcp_init_snd_vars (tcp_connection_t * tc); void tcp_connection_init_vars (tcp_connection_t * tc); always_inline void @@ -690,6 +691,7 @@ always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { ASSERT (tc->c_thread_index == vlib_get_thread_index ()); + ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -722,6 +724,7 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_retransmit_timer_set (tcp_connection_t * tc) { + ASSERT (tc->snd_una != tc->snd_una_max); tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); } @@ -769,7 +772,7 @@ tcp_retransmit_timer_update (tcp_connection_t * tc) { tcp_retransmit_timer_reset (tc); if (tc->snd_wnd < tc->snd_mss) - tcp_persist_timer_set (tc); + tcp_persist_timer_update (tc); } else tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index fc36eb29afd..cf77e6e6682 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -197,9 +197,10 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->c_c_index; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc, 0); \ + if (_init) \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "syn-rx: irs %u", \ @@ -275,11 +276,14 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "syn-tx: iss %u", \ - .format_args = "i4", \ + .format = "syn-tx: iss %u snd_una %u snd_una_max %u snd_nxt %u", \ + .format_args = "i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 1); \ + DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _tc->snd_una_max - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -287,24 +291,30 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "synack-tx: iss %u irs %u", \ - .format_args = "i4i4", \ + .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } #define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "synack-rx: iss %u irs %u", \ - .format_args = "i4i4", \ + .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -361,17 +371,20 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "%s-rxt: iss %u", \ - .format_args = "t4i4", \ + .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \ + .format_args = "t4i4i4i4i4", \ .n_enum_strings = 2, \ .enum_strings = { \ "syn", \ "syn-ack", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _type; \ ed->data[1] = _tc->iss; \ + ed->data[2] = _tc->irs; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } #else @@ -414,7 +427,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->rcv_nxt - _tc->irs; \ ed->data[1] = _tc->rcv_wnd; \ ed->data[2] = _tc->snd_nxt - _tc->iss; \ - ed->data[3] = tcp_available_wnd(_tc); \ + ed->data[3] = tcp_available_snd_wnd(_tc); \ ed->data[4] = _tc->snd_wnd; \ } @@ -422,7 +435,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ + .format = "ack-rx: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ @@ -452,13 +465,13 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "pktize: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ + .format = "tx: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->snd_nxt - _tc->iss; \ - ed->data[2] = tcp_available_snd_space (_tc); \ + ed->data[2] = tcp_available_output_snd_space (_tc); \ ed->data[3] = tcp_flight_size (_tc); \ ed->data[4] = _tc->rcv_wnd; \ } diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index a4e46d64629..08922315c99 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -38,4 +38,5 @@ tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") tcp_error (NO_WND, "No window") -tcp_error (CONNECTION_CLOSED, "Connection closed") \ No newline at end of file +tcp_error (CONNECTION_CLOSED, "Connection closed") +tcp_error (CREATE_EXISTS, "Connection already exists") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 1d90345347d..841e72a503e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -275,6 +275,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) { + clib_warning ("options parse error"); return -1; } @@ -350,9 +351,12 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tcp_syn (th0)) { /* TODO implement RFC 5961 */ - tcp_make_ack (tc0, b0); + if (tc0->state != TCP_STATE_SYN_RCVD) + tcp_make_ack (tc0, b0); + else + tcp_make_synack (tc0, b0); *next0 = tcp_next_output (tc0->c_is_ip4); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); return -1; } @@ -1842,6 +1846,74 @@ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); vlib_node_registration_t tcp4_syn_sent_node; vlib_node_registration_t tcp6_syn_sent_node; +static u8 +tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) +{ + transport_connection_t *tmp; + if (!tc) + return 1; + + u8 is_valid = (tc->c_lcl_port == hdr->dst_port + && (tc->state == TCP_STATE_LISTEN + || tc->c_rmt_port == hdr->src_port)); + + if (!is_valid) + { + if ((tmp = + stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, + tc->c_lcl_port, tc->c_rmt_port, + tc->c_transport_proto))) + { + if (tmp->lcl_port == hdr->dst_port + && tmp->rmt_port == hdr->src_port) + { + clib_warning ("half-open is valid!"); + } + } + } + return is_valid; +} + +/** + * Lookup transport connection + */ +static tcp_connection_t * +tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4) +{ + tcp_header_t *tcp; + transport_connection_t *tconn; + tcp_connection_t *tc; + if (is_ip4) + { + ip4_header_t *ip4; + ip4 = vlib_buffer_get_current (b); + tcp = ip4_next_header (ip4); + tconn = stream_session_lookup_transport_wt4 (&ip4->dst_address, + &ip4->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP4_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + else + { + ip6_header_t *ip6; + ip6 = vlib_buffer_get_current (b); + tcp = ip6_next_header (ip6); + tconn = stream_session_lookup_transport_wt6 (&ip6->dst_address, + &ip6->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP6_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + return tc; +} + always_inline uword tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -1888,6 +1960,15 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + /* Half-open completed recently but the connection was't removed + * yet by the owning thread */ + if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE)) + { + /* Make sure the connection actually exists */ + ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4)); + goto drop; + } + ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); @@ -1914,16 +1995,20 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (tcp_ack (tcp0)) { - if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) + if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt)) { + clib_warning ("ack not in rcv wnd"); if (!tcp_rst (tcp0)) tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } /* Make sure ACK is valid */ - if (tc0->snd_una > ack0) - goto drop; + if (seq_gt (tc0->snd_una, ack0)) + { + clib_warning ("ack invalid"); + goto drop; + } } /* @@ -1949,11 +2034,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* No SYN flag. Drop. */ if (!tcp_syn (tcp0)) - goto drop; + { + clib_warning ("not synack"); + goto drop; + } /* Parse options */ if (tcp_options_parse (tcp0, &tc0->rcv_opts)) - goto drop; + { + clib_warning ("options parse fail"); + goto drop; + } /* Valid SYN or SYN-ACK. Move connection from half-open pool to * current thread pool. */ @@ -1981,8 +2072,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->rcv_opts)) new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; - new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) - << new_tc0->snd_wscale; + /* RFC1323: SYN and SYN-ACK wnd not scaled */ + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; @@ -2004,6 +2095,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * allocate session send reset */ if (stream_session_connect_notify (&new_tc0->connection, 0)) { + clib_warning ("connect notify fail"); tcp_send_reset_w_pkt (new_tc0, b0, is_ip4); tcp_connection_cleanup (new_tc0); goto drop; @@ -2032,6 +2124,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tc0->rtt_ts = 0; + tcp_init_snd_vars (tc0); tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2196,6 +2289,18 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len; + if (CLIB_DEBUG) + { + tcp_connection_t *tmp; + tmp = tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (tmp->state != tc0->state) + { + clib_warning ("state changed"); + ASSERT (0); + goto drop; + } + } + /* * Special treatment for CLOSED */ @@ -2211,8 +2316,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ /* 1-4: check SEQ, RST, SYN */ - if (PREDICT_FALSE - (tcp_segment_validate (vm, tc0, b0, tcp0, &next0))) + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0, + &next0))) { error0 = TCP_ERROR_SEGMENT_INVALID; goto drop; @@ -2230,6 +2335,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { + clib_warning ("connection not accepted"); tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } @@ -2252,6 +2358,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Reset SYN-ACK retransmit and SYN_RCV establish timers */ tcp_retransmit_timer_reset (tc0); tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -2400,6 +2507,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Send FIN-ACK notify app and enter CLOSE-WAIT */ tcp_connection_timers_reset (tc0); tcp_make_fin (tc0, b0); + tc0->snd_nxt += 1; next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; @@ -2598,6 +2706,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 3. check for a SYN (did that already) */ + /* Make sure connection wasn't just created */ + child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN)) + { + error0 = TCP_ERROR_CREATE_EXISTS; + goto drop; + } + /* Create child session and send SYN-ACK */ child0 = tcp_connection_new (my_thread_index); child0->c_lcl_port = lc0->c_lcl_port; @@ -2621,12 +2737,15 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, 0 /* notify */ )) { + clib_warning ("session accept fail"); + tcp_connection_cleanup (child0); error0 = TCP_ERROR_CREATE_SESSION_FAIL; goto drop; } if (tcp_options_parse (th0, &child0->rcv_opts)) { + clib_warning ("options parse fail"); goto drop; } @@ -2651,7 +2770,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; tcp_connection_init_vars (child0); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1); /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); @@ -2768,34 +2887,6 @@ typedef enum _tcp_input_next #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) -static u8 -tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) -{ - transport_connection_t *tmp; - if (!tc) - return 1; - - u8 is_valid = (tc->c_lcl_port == hdr->dst_port - && (tc->state == TCP_STATE_LISTEN - || tc->c_rmt_port == hdr->src_port)); - - if (!is_valid) - { - if ((tmp = - stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, - tc->c_lcl_port, tc->c_rmt_port, - tc->c_transport_proto))) - { - if (tmp->lcl_port == hdr->dst_port - && tmp->rmt_port == hdr->src_port) - { - clib_warning ("half-open is valid!"); - } - } - } - return is_valid; -} - always_inline uword tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -2822,6 +2913,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; + transport_connection_t *tconn; ip4_header_t *ip40; ip6_header_t *ip60; u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; @@ -2847,15 +2939,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + tcp_header_bytes (tcp0)); n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0; - - tc0 = - (tcp_connection_t *) - stream_session_lookup_transport_wt4 (&ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); + tconn = stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } else @@ -2866,15 +2956,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - n_advance_bytes0; n_advance_bytes0 += sizeof (ip60[0]); - - tc0 = - (tcp_connection_t *) - stream_session_lookup_transport_wt6 (&ip60->dst_address, - &ip60->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP6_TCP, - my_thread_index); + tconn = stream_session_lookup_transport_wt6 (&ip60->dst_address, + &ip60->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 15a9dcb48a8..9cb3e77937e 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -66,11 +66,10 @@ format_tcp_tx_trace (u8 * s, va_list * args) } static u8 -tcp_window_compute_scale (u32 available_space) +tcp_window_compute_scale (u32 window) { u8 wnd_scale = 0; - while (wnd_scale < TCP_MAX_WND_SCALE - && (available_space >> wnd_scale) > TCP_WND_MAX) + while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX) wnd_scale++; return wnd_scale; } @@ -444,12 +443,10 @@ tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) vec_validate (tm->tx_buffers[thread_index], current_length + n_free_buffers - 1); - _vec_len (tm->tx_buffers[thread_index]) = - current_length + vlib_buffer_alloc_from_free_list (vlib_get_main (), - tm->tx_buffers - [thread_index], - n_free_buffers, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + _vec_len (tm->tx_buffers[thread_index]) = current_length + + vlib_buffer_alloc (vlib_get_main (), + &tm->tx_buffers[thread_index][current_length], + n_free_buffers); /* buffer shortage, report failure */ if (vec_len (tm->tx_buffers[thread_index]) == 0) { @@ -470,7 +467,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) return -1; } my_tx_buffers = tm->tx_buffers[thread_index]; - *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; + *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1]; _vec_len (my_tx_buffers) -= 1; return 0; } @@ -478,10 +475,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) always_inline void tcp_return_buffer (tcp_main_t * tm) { - u32 *my_tx_buffers; - u32 thread_index = vlib_get_thread_index (); - my_tx_buffers = tm->tx_buffers[thread_index]; - _vec_len (my_tx_buffers) += 1; + _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1; } always_inline void * @@ -489,7 +483,8 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { if (b->flags & VLIB_BUFFER_NEXT_PRESENT) vlib_buffer_free_one (vm, b->next_buffer); - b->flags = 0; + /* Zero all flags but free list index and trace flag */ + b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1; b->current_data = 0; b->current_length = 0; b->total_length_not_including_first_buffer = 0; @@ -503,7 +498,8 @@ always_inline void * tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) { ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); - b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->total_length_not_including_first_buffer = 0; vnet_buffer (b)->tcp.flags = 0; @@ -567,8 +563,34 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) /* Reset flags, make sure ack is sent */ vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +/** + * Convert buffer to SYN + */ +void +tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b) +{ + u8 tcp_hdr_opts_len, tcp_opts_len; + tcp_header_t *th; + u16 initial_wnd; + tcp_options_t snd_opts; + + initial_wnd = tcp_initial_window_to_advertise (tc); - tc->snd_nxt += 1; + /* Make and write options */ + memset (&snd_opts, 0, sizeof (snd_opts)); + tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, + initial_wnd); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + tcp_options_write ((u8 *) (th + 1), &snd_opts); + + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, + tc->rto * TCP_TO_TIMER_TICK); } /** @@ -582,37 +604,25 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; u16 initial_wnd; - u32 time_now; memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_reuse_buffer (vm, b); - /* Set random initial sequence */ - time_now = tcp_time_now (); - - tc->iss = random_u32 (&time_now); - tc->snd_una = tc->iss; - tc->snd_nxt = tc->iss + 1; - tc->snd_una_max = tc->snd_nxt; - initial_wnd = tcp_initial_window_to_advertise (tc); - - /* Make and write options */ tcp_opts_len = tcp_make_synack_options (tc, snd_opts); tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd); - tcp_options_write ((u8 *) (th + 1), snd_opts); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; - /* Init retransmit timer */ - tcp_retransmit_timer_set (tc); + /* Init retransmit timer. Use update instead of set because of + * retransmissions */ + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } @@ -918,44 +928,17 @@ tcp_send_syn (tcp_connection_t * tc) u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u8 tcp_hdr_opts_len, tcp_opts_len; - tcp_header_t *th; - u32 time_now; - u16 initial_wnd; - tcp_options_t snd_opts; if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); - - /* Set random initial sequence */ - time_now = tcp_time_now (); - - tc->iss = random_u32 (&time_now); - tc->snd_una = tc->iss; - tc->snd_una_max = tc->snd_nxt = tc->iss + 1; - - initial_wnd = tcp_initial_window_to_advertise (tc); - - /* Make and write options */ - memset (&snd_opts, 0, sizeof (snd_opts)); - tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); - - th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, - tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, - initial_wnd); - - tcp_options_write ((u8 *) (th + 1), &snd_opts); + tcp_make_syn (tc, b); /* Measure RTT with this */ tc->rtt_ts = tcp_time_now (); tc->rtt_seq = tc->snd_nxt; - - /* Start retransmit trimer */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK); tc->rto_boff = 0; /* Set the connection establishment timer */ @@ -1010,8 +993,12 @@ tcp_send_fin (tcp_connection_t * tc) /* buffer will be initialized by in tcp_make_fin */ tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); - tc->flags |= TCP_CONN_FINSNT; - tc->flags &= ~TCP_CONN_FINPNDG; + if (!(tc->flags & TCP_CONN_FINSNT)) + { + tc->flags |= TCP_CONN_FINSNT; + tc->flags &= ~TCP_CONN_FINPNDG; + tc->snd_nxt += 1; + } tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -1146,6 +1133,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, * Make sure we can retransmit something */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); available_bytes -= offset; if (!available_bytes) return 0; @@ -1209,6 +1197,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, VLIB_FRAME_SIZE - available_bufs)) { tcp_return_buffer (tm); + *b = 0; return 0; } } @@ -1236,7 +1225,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, ASSERT (n_peeked == len_to_deq); n_bytes += n_peeked; chain_b->current_length = n_peeked; - chain_b->flags = 0; + chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; chain_b->next_buffer = 0; /* update previous buffer */ @@ -1310,19 +1299,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } - if (!tcp_in_recovery (tc) && tc->rto_boff > 0 - && tc->state >= TCP_STATE_ESTABLISHED) - { - tc->rto_boff = 0; - tcp_update_rto (tc); - } - - /* Increment RTO backoff (also equal to number of retries) */ - tc->rto_boff += 1; - - /* Go back to first un-acked byte */ - tc->snd_nxt = tc->snd_una; - if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ @@ -1332,6 +1308,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) return; } + /* We're not in recovery so make sure rto_boff is 0 */ + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + + /* Increment RTO backoff (also equal to number of retries) and go back + * to first un-acked byte */ + tc->rto_boff += 1; + tc->snd_nxt = tc->snd_una; + /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); @@ -1349,12 +1337,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (n_bytes == 0) { - if (b) - { - clib_warning ("retransmit fail: %U", format_tcp_connection, tc, - 2); - ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion); - } + ASSERT (!b); + if (tc->snd_una == tc->snd_una_max) + return; + ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion); + clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2); /* Try again eventually */ tcp_retransmit_timer_set (tc); return; @@ -1365,16 +1352,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* For first retransmit, record timestamp (Eifel detection RFC3522) */ if (tc->rto_boff == 1) tc->snd_rxt_ts = tcp_time_now (); + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_retransmit_timer_update (tc); } - /* Retransmit for SYN/SYNACK */ - else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) + /* Retransmit for SYN */ + else if (tc->state == TCP_STATE_SYN_SENT) { /* Half-open connection actually moved to established but we were * waiting for syn retransmit to pop to call cleanup from the right * thread. */ if (tc->flags & TCP_CONN_HALF_OPEN_DONE) { - ASSERT (tc->state == TCP_STATE_SYN_SENT); if (tcp_half_open_connection_cleanup (tc)) { clib_warning ("could not remove half-open connection"); @@ -1385,49 +1374,46 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ + tc->rto_boff += 1; if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - { - clib_warning ("tcp_get_free_buffer_index FAIL"); - return; - } + return; + b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); - tcp_push_hdr_i (tc, b, tc->state, 1); + tcp_make_syn (tc, b); - /* Account for the SYN */ - tc->snd_nxt += 1; tc->rtt_ts = 0; - TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, - (tc->state == TCP_STATE_SYN_SENT ? 0 : 1)); + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0); + + /* This goes straight to ipx_lookup. Retransmit timer set already */ + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); } - else + /* Retransmit SYN-ACK */ + else if (tc->state == TCP_STATE_SYN_RCVD) { - ASSERT (tc->state == TCP_STATE_CLOSED); - clib_warning ("connection closed ..."); - return; - } + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + tc->rtt_ts = 0; - if (!is_syn) - { - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_make_synack (tc, b); + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1); - /* Re-enable retransmit timer */ - tcp_retransmit_timer_set (tc); + /* Retransmit timer already updated, just enqueue to output */ + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } else { - ASSERT (tc->state == TCP_STATE_SYN_SENT); - - /* This goes straight to ipx_lookup */ - tcp_push_ip_hdr (tm, tc, b); - tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); - - /* Re-enable retransmit timer */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, - tc->rto * TCP_TO_TIMER_TICK); + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; } } diff --git a/src/vppinfra/tw_timer_template.c b/src/vppinfra/tw_timer_template.c index aba00142051..abad3718b6f 100644 --- a/src/vppinfra/tw_timer_template.c +++ b/src/vppinfra/tw_timer_template.c @@ -572,7 +572,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif pool_put (tw->timers, t); } @@ -635,7 +636,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif pool_put (tw->timers, t); } @@ -689,7 +691,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif pool_put (tw->timers, t); } @@ -725,7 +728,7 @@ static inline next_index = t->next; vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, t - tw->timers); #endif pool_put (tw->timers, t); } -- cgit 1.2.3-korg