From 9d063047eb1a3738cb0fc9ebebb55793d155bb20 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 14 Sep 2017 03:08:00 -0400 Subject: session/tcp: improve preallocated segment handling - add preallocated segment flag - don't remove pre-allocated segments except if application detaches - when preallocating fifos in multiple segments, completely fill a segment before moving to the next - detach server application from segment-managers when deleting app - batch syn/syn-ack/fin (re)transmissions - loosen up close-wait and time-wait times Change-Id: I412f53ce601cc83b3acc26aeffd7fa2d52d73b03 Signed-off-by: Florin Coras --- src/svm/svm_fifo_segment.c | 33 +++++++------- src/svm/svm_fifo_segment.h | 1 + src/vnet/session/application.c | 2 + src/vnet/session/segment_manager.c | 73 ++++++++++++++++-------------- src/vnet/session/session.c | 18 +++----- src/vnet/session/session_node.c | 21 +++++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 7 ++- src/vnet/tcp/tcp.h | 9 ++-- src/vnet/tcp/tcp_debug.h | 9 +--- src/vnet/tcp/tcp_input.c | 81 +++++++++++++++++---------------- src/vnet/tcp/tcp_output.c | 91 ++++++++++++++++++++++++++++++-------- 12 files changed, 209 insertions(+), 138 deletions(-) (limited to 'src') diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index 3bdd2b28ebb..da2b79351a5 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -57,11 +57,12 @@ allocate_new_fifo_chunk (svm_fifo_segment_header_t * fsh, } static void -preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, +preallocate_fifo_pairs (svm_fifo_segment_private_t * s, svm_fifo_segment_create_args_t * a) { - u32 rx_fifo_size, tx_fifo_size; - u32 rx_rounded_data_size, tx_rounded_data_size; + svm_fifo_segment_header_t *fsh = s->h; + u32 rx_fifo_size, tx_fifo_size, pairs_to_allocate; + u32 rx_rounded_data_size, tx_rounded_data_size, pair_size; svm_fifo_t *f; u8 *rx_fifo_space, *tx_fifo_space; int rx_freelist_index, tx_freelist_index; @@ -97,10 +98,11 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, - max_log2 (FIFO_SEGMENT_MIN_FIFO_SIZE); /* Calculate space requirements */ - rx_fifo_size = (sizeof (*f) + rx_rounded_data_size) - * a->preallocated_fifo_pairs; - tx_fifo_size = (sizeof (*f) + tx_rounded_data_size) - * a->preallocated_fifo_pairs; + pair_size = 2 * sizeof (*f) + rx_rounded_data_size + tx_rounded_data_size; + pairs_to_allocate = clib_min (s->ssvm.ssvm_size / pair_size, + a->preallocated_fifo_pairs); + rx_fifo_size = (sizeof (*f) + rx_rounded_data_size) * pairs_to_allocate; + tx_fifo_size = (sizeof (*f) + tx_rounded_data_size) * pairs_to_allocate; vec_validate_init_empty (fsh->free_fifos, clib_max (rx_freelist_index, tx_freelist_index), @@ -139,7 +141,7 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, /* Carve rx fifo space */ f = (svm_fifo_t *) rx_fifo_space; - for (i = 0; i < a->preallocated_fifo_pairs; i++) + for (i = 0; i < pairs_to_allocate; i++) { f->freelist_index = rx_freelist_index; f->next = fsh->free_fifos[rx_freelist_index]; @@ -149,7 +151,7 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, } /* Carve tx fifo space */ f = (svm_fifo_t *) tx_fifo_space; - for (i = 0; i < a->preallocated_fifo_pairs; i++) + for (i = 0; i < pairs_to_allocate; i++) { f->freelist_index = tx_freelist_index; f->next = fsh->free_fifos[tx_freelist_index]; @@ -157,6 +159,9 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, tx_fifo_space += sizeof (*f) + tx_rounded_data_size; f = (svm_fifo_t *) tx_fifo_space; } + + /* Account for the pairs allocated */ + a->preallocated_fifo_pairs -= pairs_to_allocate; } /** (master) create an svm fifo segment */ @@ -200,7 +205,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) sh->opaque[0] = fsh; s->h = fsh; fsh->segment_name = format (0, "%s%c", a->segment_name, 0); - preallocate_fifo_pairs (fsh, a); + preallocate_fifo_pairs (s, a); ssvm_pop_heap (oldheap); @@ -245,10 +250,6 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) segment_count = a->private_segment_count; } - /* Spread preallocated fifo pairs across segments */ - a->preallocated_fifo_pairs = - (a->preallocated_fifo_pairs + segment_count - 1) / segment_count; - /* Allocate segments */ for (i = 0; i < segment_count; i++) { @@ -280,9 +281,11 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) if (a->private_segment_count) { + if (i != 0) + fsh->flags |= FIFO_SEGMENT_F_IS_PREALLOCATED; oldheap = clib_mem_get_heap (); clib_mem_set_heap (sh->heap); - preallocate_fifo_pairs (fsh, a); + preallocate_fifo_pairs (s, a); clib_mem_set_heap (oldheap); } sh->ready = 1; diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 7c97e9b489f..5b771328c6c 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -33,6 +33,7 @@ typedef enum #define FIFO_SEGMENT_F_IS_PRIVATE 1 << 0 /* Private segment */ #define FIFO_SEGMENT_F_IS_MAIN_HEAP 1 << 1 /* Segment is main heap */ +#define FIFO_SEGMENT_F_IS_PREALLOCATED 1 << 2 /* Segment is preallocated */ typedef struct { diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index d105119c557..2b789c5f420 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -111,6 +111,8 @@ application_del (application_t * app) hash_foreach (handle, index, app->listeners_table, ({ vec_add1 (handles, handle); + sm = segment_manager_get (index); + sm->app_index = SEGMENT_MANAGER_INVALID_APP_INDEX; })); /* *INDENT-ON* */ diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index c23e4c0237c..48d027553b1 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -197,27 +197,24 @@ u8 segment_manager_has_fifos (segment_manager_t * sm) { svm_fifo_segment_private_t *segment; - /* Weird, but handle it */ - if (vec_len (sm->segment_indices) == 0) - return 0; - if (vec_len (sm->segment_indices) == 1) - { - segment = svm_fifo_segment_get_segment (sm->segment_indices[0]); - if (svm_fifo_segment_num_fifos (segment) == 0) - return 0; - } - if (CLIB_DEBUG) + int i; + + for (i = 0; i < vec_len (sm->segment_indices); i++) { - svm_fifo_segment_private_t *segment; - int i; - for (i = 1; i < vec_len (sm->segment_indices); i++) - { - segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); - if (!svm_fifo_segment_has_fifos (segment)) - clib_warning ("segment has no fifos!"); - } + segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); + if (CLIB_DEBUG && i && !svm_fifo_segment_has_fifos (segment) + && !(segment->h->flags & FIFO_SEGMENT_F_IS_PREALLOCATED)) + clib_warning ("segment %d has no fifos!", sm->segment_indices[i]); + if (svm_fifo_segment_has_fifos (segment)) + return 1; } - return 1; + return 0; +} + +static u8 +segment_manager_app_detached (segment_manager_t * sm) +{ + return (sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX); } static void @@ -228,6 +225,13 @@ segment_manager_del_segment (segment_manager_t * sm, u32 segment_index) clib_spinlock_lock (&sm->lockp); svm_segment_index = sm->segment_indices[segment_index]; fifo_segment = svm_fifo_segment_get_segment (svm_segment_index); + if (!fifo_segment + || ((fifo_segment->h->flags & FIFO_SEGMENT_F_IS_PREALLOCATED) + && !segment_manager_app_detached (sm))) + { + clib_spinlock_unlock (&sm->lockp); + return; + } svm_fifo_segment_delete (fifo_segment); vec_del1 (sm->segment_indices, segment_index); clib_spinlock_unlock (&sm->lockp); @@ -288,26 +292,29 @@ segment_manager_del_sessions (segment_manager_t * sm) * * Since the fifos allocated in the segment keep backpointers to the sessions * prior to removing the segment, we call session disconnect. This - * subsequently propages into transport. + * subsequently propagates into transport. */ void segment_manager_del (segment_manager_t * sm) { + int i; - ASSERT (vec_len (sm->segment_indices) <= 1); - if (vec_len (sm->segment_indices)) + ASSERT (!segment_manager_has_fifos (sm) + && segment_manager_app_detached (sm)); + + /* If we have empty preallocated segments that haven't been removed, remove + * them now. Apart from that, the first segment in the first segment manager + * is not removed when all fifos are removed. It can only be removed when + * the manager is explicitly deleted/detached by the app. */ + for (i = vec_len (sm->segment_indices) - 1; i >= 0; i--) { - /* The first segment in the first segment manager is not removed when - * all fifos are removed. It can only be removed when the manager is - * explicitly deleted/detached by the app. */ if (CLIB_DEBUG) { - svm_fifo_segment_private_t *fifo_segment; - fifo_segment = - svm_fifo_segment_get_segment (sm->segment_indices[0]); - ASSERT (!svm_fifo_segment_has_fifos (fifo_segment)); + svm_fifo_segment_private_t *segment; + segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); + ASSERT (!svm_fifo_segment_has_fifos (segment)); } - segment_manager_del_segment (sm, 0); + segment_manager_del_segment (sm, i); } clib_spinlock_free (&sm->lockp); if (CLIB_DEBUG) @@ -322,8 +329,7 @@ segment_manager_init_del (segment_manager_t * sm) segment_manager_del_sessions (sm); else { - ASSERT (!sm->first_is_protected - || sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX); + ASSERT (!sm->first_is_protected || segment_manager_app_detached (sm)); segment_manager_del (sm); } } @@ -478,7 +484,8 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, } /* Remove segment manager if no sessions and detached from app */ - if (sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX && is_first) + if (segment_manager_app_detached (sm) + && !segment_manager_has_fifos (sm)) segment_manager_del (sm); } } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 792e6612dc1..dc930ce87d3 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -453,7 +453,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) st); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { - clib_warning ("half-open was removed!"); + TCP_DBG ("half-open was removed!"); return -1; } @@ -732,6 +732,7 @@ session_send_session_evt_to_thread (u64 session_handle, u32 thread_index) { static u16 serial_number = 0; + u32 tries = 0; session_fifo_event_t evt; unix_shared_memory_queue_t *q; @@ -741,21 +742,14 @@ session_send_session_evt_to_thread (u64 session_handle, evt.event_id = serial_number++; q = session_manager_get_vpp_event_queue (thread_index); - - /* Based on request block (or not) for lack of space */ - if (PREDICT_TRUE (q->cursize < q->maxsize)) + while (unix_shared_memory_queue_add (q, (u8 *) & evt, 1)) { - if (unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ )) + if (tries++ == 3) { - clib_warning ("failed to enqueue evt"); + TCP_DBG ("failed to enqueue evt"); + break; } } - else - { - clib_warning ("queue full"); - return; - } } /** diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 09687687189..d015584990e 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -168,15 +168,19 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, return 0; } + /* Check how much we can pull. */ + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo); + if (peek_data) { - /* Offset in rx fifo from where to peek data */ + /* Offset in rx fifo from where to peek data */ tx_offset = transport_vft->tx_fifo_offset (tc0); + if (PREDICT_FALSE (tx_offset >= max_dequeue0)) + max_dequeue0 = 0; + else + max_dequeue0 -= tx_offset; } - /* Check how much we can pull. If buffering, subtract the offset */ - max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - tx_offset; - /* Nothing to read return */ if (max_dequeue0 == 0) { @@ -277,6 +281,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, tx_offset, len_to_deq0, data0); + if (n_bytes_read <= 0) + goto dequeue_fail; /* Keep track of progress locally, transport is also supposed to * increment it independently when pushing the header */ tx_offset += n_bytes_read; @@ -285,11 +291,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, len_to_deq0, data0); + if (n_bytes_read <= 0) + goto dequeue_fail; } - if (n_bytes_read <= 0) - goto dequeue_fail; - b0->current_length = n_bytes_read; left_to_snd0 -= n_bytes_read; @@ -616,7 +621,7 @@ skip_dequeue: case FIFO_EVENT_APP_TX: s0 = session_event_get_session (e0, my_thread_index); - if (CLIB_DEBUG && !s0) + if (PREDICT_FALSE (!s0)) { clib_warning ("It's dead, Jim!"); continue; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 5b4c8679970..527b3289924 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -510,7 +510,7 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) if ((i % 4) == 0) vlib_process_suspend (vm, 10e-6); ASSERT (i + 1 >= tm->ready_connections); - while (i + 1 - tm->ready_connections > 8000) + while (i + 1 - tm->ready_connections > 1000) { vlib_process_suspend (vm, 100e-6); } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 04f1e068b9d..f779428fbaf 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1035,7 +1035,7 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) /* If not snd_wnd constrained and we can't write at least a segment, * don't try at all */ if (PREDICT_FALSE (snd_space < tc->snd_mss)) - return 0; + return snd_space < tc->cwnd ? 0 : snd_space; /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1167,6 +1167,7 @@ tcp_timer_establish_handler (u32 conn_index) { ASSERT (tc->state == TCP_STATE_SYN_SENT); stream_session_connect_notify (&tc->connection, 1 /* fail */ ); + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); } else { @@ -1174,7 +1175,7 @@ tcp_timer_establish_handler (u32 conn_index) /* note: the connection may have already disappeared */ if (PREDICT_FALSE (tc == 0)) return; - + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); ASSERT (tc->state == TCP_STATE_SYN_RCVD); /* Start cleanup. App wasn't notified yet so use delete notify as * opposed to delete to cleanup session layer state. */ @@ -1369,6 +1370,8 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->tx_frames[0], num_threads - 1); vec_validate (tm->tx_frames[1], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1); tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 6020a3debbe..bb8091af84f 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -99,8 +99,9 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_SYN_RCVD_TIME 600 /* 60s */ #define TCP_2MSL_TIME 300 /* 30s */ -#define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ -#define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ +#define TCP_CLOSEWAIT_TIME 20 /* 2s */ +#define TCP_TIMEWAIT_TIME 20 /* 2s */ +#define TCP_CLEANUP_TIME 10 /* 1s Time to wait before cleanup */ #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ @@ -372,8 +373,10 @@ typedef struct _tcp_main /** per-worker tx buffer free lists */ u32 **tx_buffers; - /** per-worker tx frames to 4/6 output nodes */ + /** per-worker tx frames to tcp 4/6 output nodes */ vlib_frame_t **tx_frames[2]; + /** per-worker tx frames to ip 4/6 lookup nodes */ + vlib_frame_t **ip_lookup_tx_frames[2]; /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index cf77e6e6682..4bc6b42e297 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -82,13 +82,7 @@ typedef enum _tcp_dbg_evt * Infra and evt track setup */ -#define TCP_DBG(_tc, _evt, _args...) \ -{ \ - u8 *_tmp = 0; \ - _tmp = format(_tmp, "%U", format_tcp_connection_verbose, _tc); \ - clib_warning("%s", _tmp); \ - vec_free(_tmp); \ -} +#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args) #define DECLARE_ETD(_tc, _e, _size) \ struct \ @@ -240,6 +234,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else #define TCP_EVT_DBG(_evt, _args...) +#define TCP_DBG(_fmt, _args...) #endif /* diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 841e72a503e..64a07070ec2 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -351,12 +351,17 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tcp_syn (th0)) { /* TODO implement RFC 5961 */ - if (tc0->state != TCP_STATE_SYN_RCVD) - tcp_make_ack (tc0, b0); + if (tc0->state == TCP_STATE_SYN_RCVD) + { + tcp_make_synack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); + } else - tcp_make_synack (tc0, b0); + { + tcp_make_ack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0); + } *next0 = tcp_next_output (tc0->c_is_ip4); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); return -1; } @@ -1747,18 +1752,17 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (PREDICT_FALSE (is_fin)) { - /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead - * wait for session to call close. To avoid lingering + /* Enter CLOSE-WAIT and notify session. To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ - tc0->state = TCP_STATE_CLOSE_WAIT; - TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + /* Account for the FIN if nothing else was received */ if (vnet_buffer (b0)->tcp.data_len == 0) - { - tc0->rcv_nxt += 1; - next0 = TCP_ESTABLISHED_NEXT_DROP; - } + tc0->rcv_nxt += 1; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; stream_session_disconnect_notify (&tc0->connection); tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); } done: @@ -1973,6 +1977,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); + /* Crude check to see if the connection handle does not match + * the packet. Probably connection just switched to established */ + if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port + || tcp0->src_port != tc0->c_rmt_port)) + goto drop; + if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; @@ -2265,6 +2275,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + u8 is_fin0; bi0 = from[0]; to_next[0] = bi0; @@ -2283,11 +2294,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tcp0 = tcp_buffer_hdr (b0); + is_fin0 = tcp_is_fin (tcp0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) - + vnet_buffer (b0)->tcp.data_len; + + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len; if (CLIB_DEBUG) { @@ -2384,21 +2395,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ else if (tc0->snd_una == tc0->snd_una_max) { - tc0->rcv_nxt += 1; tc0->state = TCP_STATE_FIN_WAIT_2; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - if (tcp_fin (tcp0)) - { - /* Stop all timers, 2MSL will be set lower */ - tcp_connection_timers_reset (tc0); - } - else - { - /* Wait for peer to finish sending its data */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, - TCP_2MSL_TIME); - } + /* Stop all retransmit timers because we have nothing more + * to send. Enable waitclose though because we're willing to + * wait for peer's FIN but not indefinitely. */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); } break; case TCP_STATE_FIN_WAIT_2: @@ -2434,10 +2438,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; /* Apparently our FIN was lost */ - if (tcp_fin (tcp0)) + if (is_fin0) { - /* Don't "make" fin since that increments snd_nxt */ tcp_send_fin (tc0); goto drop; } @@ -2450,8 +2454,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * particular, this makes sure that we won't have dead sessions * when processing events on the tx path */ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); - - /* Stop retransmit */ tcp_retransmit_timer_reset (tc0); goto drop; @@ -2466,8 +2468,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; tcp_make_ack (tc0, b0); - tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); goto drop; @@ -2486,6 +2487,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: if (vnet_buffer (b0)->tcp.data_len) error0 = tcp_segment_rcv (tm, tc0, b0, &next0); + else if (is_fin0) + tc0->rcv_nxt += 1; break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2497,7 +2500,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* 8: check the FIN bit */ - if (!tcp_fin (tcp0)) + if (!is_fin0) goto drop; switch (tc0->state) @@ -2527,19 +2530,19 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; case TCP_STATE_FIN_WAIT_2: - /* Got FIN, send ACK! */ + /* Got FIN, send ACK! Be more aggressive with resource cleanup */ tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_TIME_WAIT: - /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + /* Remain in the TIME-WAIT state. Restart the time-wait * timeout. */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); break; } TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); @@ -3162,9 +3165,9 @@ do { \ TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); - _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); - _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index b843c926afe..be29f05f65c 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -629,9 +629,11 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) } always_inline void -tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4) +tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) { + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); u32 *to_next, next_index; vlib_frame_t *f; @@ -643,13 +645,42 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, /* Send to IP lookup */ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; - f = vlib_get_frame_to_node (vm, next_index); + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 2; + b->pre_data[1] = next_index; + } + + f = tm->ip_lookup_tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f; + } - /* Enqueue the packet */ to_next = vlib_frame_vector_args (f); - to_next[0] = bi; - f->n_vectors = 1; - vlib_put_frame_to_node (vm, next_index, f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1); +} + +always_inline void +tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0); } always_inline void @@ -666,8 +697,6 @@ tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; - - /* Initialize the trajectory trace, if configured */ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) { b->pre_data[0] = 1; @@ -856,7 +885,7 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) ASSERT (!bogus); } - tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); + tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4); TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } @@ -968,7 +997,24 @@ tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) } /** - * Flush both v4 and v6 tx frames for thread index + * Flush ip lookup tx frames populated by timer pops + */ +always_inline void +tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.ip_lookup_tx_frames[!is_ip4] + [thread_index]); + tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush v4 and v6 tcp and ip-lookup tx frames for thread index */ void tcp_flush_frames_to_output (u8 thread_index) @@ -976,6 +1022,8 @@ tcp_flush_frames_to_output (u8 thread_index) vlib_main_t *vm = vlib_get_main (); tcp_flush_frame_to_output (vm, thread_index, 1); tcp_flush_frame_to_output (vm, thread_index, 0); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 1); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 0); } /** @@ -984,22 +1032,28 @@ tcp_flush_frames_to_output (u8 thread_index) void tcp_send_fin (tcp_connection_t * tc) { - vlib_buffer_t *b; - u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; + u32 bi; + u8 fin_snt = 0; + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - /* buffer will be initialized by in tcp_make_fin */ + fin_snt = tc->flags & TCP_CONN_FINSNT; + if (fin_snt) + tc->snd_nxt = tc->snd_una; tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); - if (!(tc->flags & TCP_CONN_FINSNT)) + if (!fin_snt) { tc->flags |= TCP_CONN_FINSNT; tc->flags &= ~TCP_CONN_FINPNDG; - tc->snd_nxt += 1; + /* Account for the FIN */ + tc->snd_una_max += 1; + tc->snd_nxt = tc->snd_una_max; } tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); @@ -1398,7 +1452,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) else if (tc->state == TCP_STATE_SYN_RCVD) { tc->rto_boff += 1; - tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); tc->rtt_ts = 0; if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) @@ -1414,7 +1469,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) else { ASSERT (tc->state == TCP_STATE_CLOSED); - clib_warning ("connection closed ..."); + TCP_DBG ("connection state: %d", tc->state); return; } } -- cgit 1.2.3-korg