From 68b0fb0c620c7451ef1a6380c43c39de6614db51 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 28 Feb 2017 15:15:56 -0500 Subject: VPP-598: tcp stack initial commit Change-Id: I49e5ce0aae6e4ff634024387ceaf7dbc432a0351 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_newreno.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 src/vnet/tcp/tcp_newreno.c (limited to 'src/vnet/tcp/tcp_newreno.c') diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c new file mode 100644 index 00000000..856dffe4 --- /dev/null +++ b/src/vnet/tcp/tcp_newreno.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +void +newreno_congestion (tcp_connection_t * tc) +{ + tc->prev_ssthresh = tc->ssthresh; + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); +} + +void +newreno_recovered (tcp_connection_t * tc) +{ + tc->cwnd = tc->ssthresh; +} + +void +newreno_rcv_ack (tcp_connection_t * tc) +{ + if (tcp_in_slowstart (tc)) + { + tc->cwnd += clib_min (tc->snd_mss, tc->bytes_acked); + } + else + { + /* Round up to 1 if needed */ + tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + } +} + +void +newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) +{ + if (ack_type == TCP_CC_DUPACK) + { + tc->cwnd += tc->snd_mss; + } + else if (ack_type == TCP_CC_PARTIALACK) + { + tc->cwnd -= tc->bytes_acked; + if (tc->bytes_acked > tc->snd_mss) + tc->bytes_acked += tc->snd_mss; + } +} + +void +newreno_conn_init (tcp_connection_t * tc) +{ + tc->ssthresh = tc->snd_wnd; + tc->cwnd = tcp_initial_cwnd (tc); +} + +const static tcp_cc_algorithm_t tcp_newreno = { + .congestion = newreno_congestion, + .recovered = newreno_recovered, + .rcv_ack = newreno_rcv_ack, + .rcv_cong_ack = newreno_rcv_cong_ack, + .init = newreno_conn_init +}; + +clib_error_t * +newreno_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + tcp_cc_algo_register (TCP_CC_NEWRENO, &tcp_newreno); + + return error; +} + +VLIB_INIT_FUNCTION (newreno_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 3af90fceb61d0c236709c25df936bbbf304cbff5 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 3 May 2017 21:09:42 -0700 Subject: Fix TCP loss recovery, VPP-745 Allows pure loss recovery retransmits only on timeout. Change-Id: I563cdbf9e7b890a6569350bdbda4f746ace0544e Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 35 +++++++++++++++++++++++++++-------- src/vnet/tcp/tcp.h | 4 ++-- src/vnet/tcp/tcp_input.c | 35 +++++++++++++++++++++++++---------- src/vnet/tcp/tcp_newreno.c | 2 +- src/vnet/tcp/tcp_output.c | 44 +++++++++++++++++--------------------------- 5 files changed, 72 insertions(+), 48 deletions(-) (limited to 'src/vnet/tcp/tcp_newreno.c') diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index de4edfa6..e80e2ec9 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -567,30 +567,49 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +always_inline u32 +tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) +{ + if (tc->snd_wnd < tc->snd_mss) + { + return tc->snd_wnd < snd_space ? tc->snd_wnd : 0; + } + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); +} + /** * Compute tx window session is allowed to fill. */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { - u32 snd_space, chunk; + int snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; /* If we haven't gotten dupacks or if we did and have gotten sacked bytes * then we can still send */ - if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0 && (tc->rcv_dupacks == 0 || tc->sack_sb.last_sacked_bytes))) { - chunk = tc->snd_wnd > tc->snd_mss ? tc->snd_mss : tc->snd_wnd; snd_space = tcp_available_snd_space (tc); + return tcp_round_snd_space (tc, snd_space); + } - /* If we can't write at least a segment, don't try at all */ - if (chunk == 0 || snd_space < chunk) + if (tcp_in_recovery (tc)) + { + tc->snd_nxt = tc->snd_una_max; + snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + - (tc->snd_una_max - tc->snd_congestion); + if (snd_space <= 0) return 0; - - /* round down to mss multiple */ - return snd_space - (snd_space % chunk); + return tcp_round_snd_space (tc, snd_space); } /* If in fast recovery, send 1 SMSS if wnd allows */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index f61a1b52..c75479dc 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -24,8 +24,8 @@ #include #include -#define TCP_TICK 10e-3 /**< TCP tick period (s) */ -#define THZ 1/TCP_TICK /**< TCP tick frequency */ +#define TCP_TICK 0.001 /**< TCP tick period (s) */ +#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ #define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ #define TCP_MAX_OPTION_SPACE 40 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0030cfe2..e9c52c5e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -392,11 +392,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ - if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; } - /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't @@ -406,19 +405,22 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) mrtt = tcp_time_now () - tc->opt.tsecr; } + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; + + /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is + * not valid */ + if (tc->bytes_acked) + tc->rto_boff = 0; + /* Ignore dubious measurements */ if (mrtt == 0 || mrtt > TCP_RTT_MAX) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - /* Allow measuring of RTT and make sure boff is 0 */ - tc->rtt_seq = 0; - tc->rto_boff = 0; - - return 1; + return 0; } /** @@ -735,7 +737,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_cong_recovery (tc)) + if (tcp_in_fastrecovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -749,6 +751,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* Clear retransmitted bytes. XXX should we clear all? */ tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); /* In case snd_nxt is still in the past and output tries to @@ -772,6 +775,13 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; tc->rcv_dupacks = 0; + if (tcp_in_recovery (tc)) + { + tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes); + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + if (seq_geq (tc->snd_una, tc->snd_congestion)) + tcp_recovery_off (tc); + } } } @@ -897,7 +907,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_cc_rcv_ack (tc, b); /* If everything has been acked, stop retransmit timer - * otherwise update */ + * otherwise update. */ if (tc->snd_una == tc->snd_una_max) tcp_retransmit_timer_reset (tc); else @@ -1778,6 +1788,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_send_reset (b0, is_ip4); goto drop; } + + /* Update rtt and rto */ + tc0->bytes_acked = 1; + tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 856dffe4..3525f4e5 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -38,7 +38,7 @@ newreno_rcv_ack (tcp_connection_t * tc) else { /* Round up to 1 if needed */ - tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1); } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a85d30da..7ee930c6 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -73,7 +73,7 @@ tcp_set_snd_mss (tcp_connection_t * tc) snd_mss = dummy_mtu; /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; + snd_mss = clib_min (tc->opt.mss, snd_mss); tc->snd_mss = snd_mss; @@ -923,6 +923,12 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, /* TODO this is updated in output as well ... */ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -1019,9 +1025,10 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) } /* Start again from the beginning */ - tcp_recovery_on (tc); + tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } static void @@ -1032,7 +1039,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, snd_space, n_bytes; + u32 bi, n_bytes; if (is_syn) { @@ -1065,33 +1072,16 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (snd_space == 0) - { - clib_warning ("no wnd to retransmit"); - tcp_return_buffer (tm); - - /* Force one segment */ - tcp_retransmit_first_unacked (tc); + /* Send one segment. No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + scoreboard_clear (&tc->sack_sb); - /* Re-enable retransmit timer. Output may be unwilling - * to do it for us */ - tcp_retransmit_timer_set (tc); - - return; - } - else + if (n_bytes == 0) { - /* No fancy recovery for now! */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); - scoreboard_clear (&tc->sack_sb); - - if (n_bytes == 0) - return; + clib_warning ("could not retransmit"); + return; } } else @@ -1400,7 +1390,7 @@ tcp46_output_inline (vlib_main_t * vm, } /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { -- cgit 1.2.3-korg From 93992a9048cb6e5dcd22de5091e72de778122627 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 24 May 2017 18:03:56 -0700 Subject: Implement sack based tcp loss recovery (RFC 6675) - refactor existing congestion control code (RFC 6582/5681). Handling of ack feedback now consists of: ack parsing, cc event detection, event handling, congestion control update - extend sack scoreboard to support sack based retransmissions - basic implementation of Eifel detection algorithm (RFC 3522) for detecting spurious retransmissions - actually initialize the per-thread frame freelist hash tables - increase worker stack size to 2mb - fix session queue node out-of-buffer handling - ensure that the local buffer cache vec_len matches reality - avoid 2x spurious event requeues when short of buffers - count out-of-buffer events - make the builtin server thread-safe - fix bihash template threading issue: need to paint -1 across uninitialized working_copy_length vector elements (via rebase from master) Change-Id: I646cb9f1add9a67d08f4a87badbcb117980ebfc4 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 5 +- src/vlib/node.c | 1 + src/vlib/threads.c | 2 +- src/vlib/threads.h | 2 +- src/vnet/session/node.c | 53 ++-- src/vnet/session/session.c | 11 +- src/vnet/session/session.h | 6 +- src/vnet/session/session_cli.c | 26 +- src/vnet/tcp/builtin_client.c | 40 ++- src/vnet/tcp/builtin_server.c | 20 +- src/vnet/tcp/tcp.c | 57 ++-- src/vnet/tcp/tcp.h | 112 +++++-- src/vnet/tcp/tcp_debug.h | 16 +- src/vnet/tcp/tcp_input.c | 671 +++++++++++++++++++++++++++++------------ src/vnet/tcp/tcp_newreno.c | 20 +- src/vnet/tcp/tcp_output.c | 287 ++++++++++++------ src/vnet/tcp/tcp_test.c | 53 ++-- 17 files changed, 973 insertions(+), 409 deletions(-) (limited to 'src/vnet/tcp/tcp_newreno.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index f13f6fea..5c8f244a 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -540,7 +540,7 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); - if (PREDICT_FALSE (cursize == 0)) + if (PREDICT_FALSE (cursize < relative_offset)) return -2; /* nothing in the fifo */ nitems = f->nitems; @@ -548,7 +548,8 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, real_head = real_head >= nitems ? real_head - nitems : real_head; /* Number of bytes we're going to copy */ - total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + total_copy_bytes = (cursize - relative_offset < max_bytes) ? + cursize - relative_offset : max_bytes; if (PREDICT_TRUE (copy_here != 0)) { diff --git a/src/vlib/node.c b/src/vlib/node.c index bbd3a42e..eecad274 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -502,6 +502,7 @@ vlib_node_main_init (vlib_main_t * vm) vlib_node_t *n; uword ni; + nm->frame_size_hash = hash_create (0, sizeof (uword)); nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; /* Generate sibling relationships */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index b7bc9e26..0c775e2d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -670,7 +670,7 @@ start_workers (vlib_main_t * vm) /* zap the (per worker) frame freelists, etc */ nm_clone->frame_sizes = 0; - nm_clone->frame_size_hash = 0; + nm_clone->frame_size_hash = hash_create (0, sizeof (uword)); /* Packet trace buffers are guaranteed to be empty, nothing to do here */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 17d35a24..572ce77f 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -62,7 +62,7 @@ typedef struct vlib_thread_registration_ #define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */ #define VLIB_OFFSET_MASK (~VLIB_CPU_MASK) -#define VLIB_LOG2_THREAD_STACK_SIZE (20) +#define VLIB_LOG2_THREAD_STACK_SIZE (21) #define VLIB_THREAD_STACK_SIZE (1<session_type]; @@ -167,9 +169,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Check how much we can pull. If buffering, subtract the offset */ max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; - /* Allow enqueuing of a new event */ - svm_fifo_unset_event (s0->server_tx_fifo); - /* Nothing to read return */ if (max_dequeue0 == 0) return 0; @@ -187,8 +186,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, max_len_to_snd0 = snd_space0; } - n_bytes_per_buf = vlib_buffer_free_list_buffer_size (vm, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + n_bytes_per_buf = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) @@ -205,24 +204,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) { vec_validate (smm->tx_buffers[thread_index], - n_bufs + VLIB_FRAME_SIZE - 1); - n_bufs += vlib_buffer_alloc (vm, - &smm->tx_buffers[thread_index][n_bufs], - VLIB_FRAME_SIZE); - - /* buffer shortage - * XXX 0.9 because when debugging we might not get a full frame */ - if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) + n_bufs + 2 * VLIB_FRAME_SIZE - 1); + + buffers_allocated = 0; + do { - if (svm_fifo_set_event (s0->server_tx_fifo)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - } - return -1; + buffers_allocated_this_call = + vlib_buffer_alloc + (vm, + &smm->tx_buffers[thread_index][n_bufs + buffers_allocated], + 2 * VLIB_FRAME_SIZE - buffers_allocated); + buffers_allocated += buffers_allocated_this_call; } + while (buffers_allocated_this_call > 0 + && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); + + n_bufs += buffers_allocated; _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + return -1; + } } + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) @@ -232,7 +240,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, */ /* Get free buffer */ + ASSERT (n_bufs >= 1); bi0 = smm->tx_buffers[thread_index][--n_bufs]; + ASSERT (bi0); _vec_len (smm->tx_buffers[thread_index]) = n_bufs; b0 = vlib_get_buffer (vm, bi0); @@ -545,9 +555,10 @@ skip_dequeue: my_thread_index, &n_tx_packets); /* Out of buffers */ - if (rv < 0) + if (PREDICT_FALSE (rv < 0)) { - vec_add1 (smm->pending_event_vector[my_thread_index], *e0); + vlib_node_increment_counter (vm, node->node_index, + SESSION_QUEUE_ERROR_NO_BUFFER, 1); continue; } break; diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 02b0cced..534598d6 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -551,7 +551,7 @@ u8 stream_session_no_space (transport_connection_t * tc, u32 thread_index, u16 data_len) { - stream_session_t *s = stream_session_get (tc->c_index, thread_index); + stream_session_t *s = stream_session_get (tc->s_index, thread_index); if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY)) return 1; @@ -563,6 +563,15 @@ stream_session_no_space (transport_connection_t * tc, u32 thread_index, } u32 +stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + if (s->session_state != SESSION_STATE_READY) + return 0; + return svm_fifo_max_dequeue (s->server_tx_fifo); +} + +int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes) { diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index a8728649..d9c38bd1 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -352,16 +352,18 @@ stream_session_max_rx_enqueue (transport_connection_t * tc) } always_inline u32 -stream_session_fifo_size (transport_connection_t * tc) +stream_session_rx_fifo_size (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); return s->server_rx_fifo->nitems; } +u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc); + int stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order); -u32 +int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 509eedbb..6b8341aa 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -15,6 +15,15 @@ #include #include +u8 * +format_stream_session_fifos (u8 * s, va_list * args) +{ + stream_session_t *ss = va_arg (*args, stream_session_t *); + s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1); + s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1); + return s; +} + /** * Format stream session as per the following format * @@ -44,6 +53,8 @@ format_stream_session (u8 * s, va_list * args) ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else if (ss->session_state == SESSION_STATE_LISTENING) { @@ -57,8 +68,12 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U%v", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose, str); + s = format (s, "[CL] %-40U", tp_vft->format_connection, + ss->connection_index, ss->thread_index, verbose); + if (verbose == 1) + s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else { @@ -124,13 +139,6 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ({ vec_reset_length (str); str = format (str, "%U", format_stream_session, s, verbose); - if (verbose > 1) - { - str = format (str, " Rx fifo: %U", format_svm_fifo, - s->server_rx_fifo, 1); - str = format (str, " Tx fifo: %U", format_svm_fifo, - s->server_tx_fifo, 1); - } vlib_cli_output (vm, "%v", str); })); /* *INDENT-ON* */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 768f0c3c..7238cda3 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -115,8 +115,17 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) /* Allow enqueuing of new event */ // svm_fifo_unset_event (rx_fifo); - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), - tm->rx_buf); + if (test_bytes) + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), + tm->rx_buf); + } + else + { + n_read = svm_fifo_max_dequeue (rx_fifo); + svm_fifo_dequeue_drop (rx_fifo, n_read); + } + if (n_read > 0) { if (TCP_BUILTIN_CLIENT_DBG) @@ -165,6 +174,8 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, int i; int delete_session; u32 *connection_indices; + u32 tx_quota = 0; + u32 delta, prev_bytes_received_this_session; connection_indices = tm->connection_index_by_thread[my_thread_index]; @@ -177,14 +188,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, sp = pool_elt_at_index (tm->sessions, connection_indices[i]); - if (sp->bytes_to_send > 0) + if (tx_quota < 60 && sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; + tx_quota++; } if (sp->bytes_to_receive > 0) { + prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); + delta = sp->bytes_received - prev_bytes_received_this_session; + if (delta > 0) + tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) @@ -195,11 +211,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; dmp->handle = sp->vpp_session_handle; - vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; - __sync_fetch_and_add (&tm->ready_connections, -1); +// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, + 1)) + { + vec_delete (connection_indices, 1, i); + tm->connection_index_by_thread[my_thread_index] = + connection_indices; + __sync_fetch_and_add (&tm->ready_connections, -1); + } + else + { + vl_msg_api_free (dmp); + } /* Kick the debug CLI process */ if (tm->ready_connections == 0) diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 4f0e211c..8bd2f360 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -39,7 +39,8 @@ typedef struct { - u8 *rx_buf; + /* Per-thread RX buffer */ + u8 **rx_buf; unix_shared_memory_queue_t **vpp_queue; u64 byte_index; @@ -117,13 +118,15 @@ void test_bytes (builtin_server_main_t * bsm, int actual_transfer) { int i; + u32 my_thread_id = vlib_get_thread_index (); for (i = 0; i < actual_transfer; i++) { - if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) + if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff)) { clib_warning ("at %lld expected %d got %d", bsm->byte_index + i, - (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); + (bsm->byte_index + i) & 0xff, + bsm->rx_buf[my_thread_id][i]); } } bsm->byte_index += actual_transfer; @@ -138,6 +141,7 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; + u32 my_thread_id = vlib_get_thread_index (); tx_fifo = s->server_tx_fifo; rx_fifo = s->server_rx_fifo; @@ -171,11 +175,12 @@ builtin_server_rx_callback (stream_session_t * s) return 0; } - vec_validate (bsm->rx_buf, max_transfer - 1); - _vec_len (bsm->rx_buf) = max_transfer; + vec_validate (bsm->rx_buf, my_thread_id); + vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1); + _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf); + bsm->rx_buf[my_thread_id]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -184,7 +189,8 @@ builtin_server_rx_callback (stream_session_t * s) * Echo back */ - n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, bsm->rx_buf); + n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, + bsm->rx_buf[my_thread_id]); if (n_written != max_transfer) clib_warning ("short trout!"); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 9b7b2f65..e0b67a8e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -195,8 +195,8 @@ tcp_connection_close (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_CLOSE, tc); /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_CLOSE_WAIT) + if (tc->state == TCP_STATE_ESTABLISHED + || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) tcp_send_fin (tc); /* Switch state */ @@ -480,7 +480,7 @@ u8 * format_tcp_timers (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = 0; + int i, last = -1; for (i = 0; i < TCP_N_TIMERS; i++) if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) @@ -493,7 +493,7 @@ format_tcp_timers (u8 * s, va_list * args) s = format (s, "%s,", tcp_conn_timers[i]); } - if (last > 0) + if (last >= 0) s = format (s, "%s]", tcp_conn_timers[i]); else s = format (s, "]"); @@ -526,19 +526,19 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u send space %u rcv_wnd available %d\n", - tcp_flight_size (tc), tcp_snd_space (tc), - tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las)); + s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", + tcp_flight_size (tc), tcp_available_snd_space (tc), + tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", - tc->cwnd, tc->ssthresh, tc->rtx_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u\n", tc->prev_ssthresh, - tc->snd_congestion - tc->iss); + tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + tc->prev_ssthresh, tc->snd_congestion - tc->iss, + tc->rcv_dupacks); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); - if (scoreboard_first_hole (&tc->sack_sb)) - s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -595,9 +595,10 @@ format_tcp_session (u8 * s, va_list * args) tc = tcp_connection_get (tci, thread_index); if (tc) - return format (s, "%U", format_tcp_connection, tc, verbose); + s = format (s, "%U", format_tcp_connection, tc, verbose); else - return format (s, "empty"); + s = format (s, "empty"); + return s; } u8 * @@ -643,13 +644,17 @@ format_tcp_scoreboard (u8 * s, va_list * args) { sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); sack_scoreboard_hole_t *hole; - s = format (s, "head %u tail %u snd_una_adv %u\n", sb->head, sb->tail, - sb->snd_una_adv); - s = format (s, "sacked_bytes %u last_sacked_bytes %u", sb->sacked_bytes, - sb->last_sacked_bytes); - s = format (s, " max_byte_sacked %u\n", sb->max_byte_sacked); - s = format (s, "holes:\n"); + s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", + sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); + s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", + sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", + sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + hole = scoreboard_first_hole (sb); + if (hole) + s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail); + while (hole) { s = format (s, "%U", format_tcp_sack_hole, hole); @@ -736,7 +741,7 @@ tcp_snd_space (tcp_connection_t * tc) if (tcp_in_recovery (tc)) { tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes - (tc->snd_una_max - tc->snd_congestion); if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; @@ -744,8 +749,8 @@ tcp_snd_space (tcp_connection_t * tc) } /* If in fast recovery, send 1 SMSS if wnd allows */ - if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) - && tcp_fastrecovery_sent_1_smss (tc)) + if (tcp_in_fastrecovery (tc) + && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; @@ -761,6 +766,12 @@ tcp_session_send_space (transport_connection_t * trans_conn) return tcp_snd_space (tc); } +i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index c3ebe22b..071f1ab1 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -34,6 +34,7 @@ #define TCP_MAX_RX_FIFO_SIZE 2 << 20 #define TCP_IW_N_SEGMENTS 10 #define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ +#define TCP_USE_SACKS 1 /**< Disable only for testing */ /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -94,7 +95,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_2MSL_TIME 300 /* 30s */ -#define TCP_CLOSEWAIT_TIME 1 /* 0.1s */ +#define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ @@ -157,6 +158,7 @@ typedef struct _sack_scoreboard_hole u32 prev; /**< Index for previous entry in linked list */ u32 start; /**< Start sequence number */ u32 end; /**< End sequence number */ + u8 is_lost; /**< Mark hole as lost */ } sack_scoreboard_hole_t; typedef struct _sack_scoreboard @@ -166,8 +168,13 @@ typedef struct _sack_scoreboard u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 last_bytes_delivered; /**< Number of sack bytes delivered */ u32 snd_una_adv; /**< Bytes to add to snd_una */ - u32 max_byte_sacked; /**< Highest byte acked */ + u32 high_sacked; /**< Highest byte sacked (fack) */ + u32 high_rxt; /**< Highest retransmitted sequence */ + u32 rescue_rxt; /**< Rescue sequence number */ + u32 lost_bytes; /**< Bytes lost as per RFC6675 */ + u32 cur_rxt_hole; /**< Retransmitting from this hole */ } sack_scoreboard_t; typedef enum _tcp_cc_algorithm_type @@ -211,7 +218,7 @@ typedef struct _tcp_connection u32 irs; /**< initial remote sequence */ /* Options */ - tcp_options_t opt; /**< TCP connection options parsed */ + tcp_options_t rcv_opts; /**< Rx options for connection */ tcp_options_t snd_opts; /**< Tx options for connection */ u8 snd_opts_len; /**< Tx options len */ u8 rcv_wscale; /**< Window scale to advertise to peer */ @@ -229,8 +236,10 @@ typedef struct _tcp_connection u32 cwnd; /**< Congestion window */ u32 ssthresh; /**< Slow-start threshold */ u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 prev_cwnd; /**< ssthresh before congestion */ u32 bytes_acked; /**< Bytes acknowledged by current segment */ - u32 rtx_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ u32 snd_congestion; /**< snd_una_max when congestion is detected */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ @@ -411,6 +420,7 @@ void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); +void tcp_update_rto (tcp_connection_t * tc); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -428,17 +438,39 @@ tcp_end_seq (tcp_header_t * th, u32 len) #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) +/** + * Our estimate of the number of bytes that have left the network + */ +always_inline u32 +tcp_bytes_out (const tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; + else + return tc->rcv_dupacks * tc->snd_mss; +} + +/** + * Our estimate of the number of bytes in flight (pipe size) + */ always_inline u32 tcp_flight_size (const tcp_connection_t * tc) { int flight_size; - flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes) - - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ; + flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc) + + tc->snd_rxt_bytes; - /* Happens if we don't clear sacked bytes */ if (flight_size < 0) - return 0; + { + if (0) + clib_warning + ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d", + tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc), + tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes, + tc->rcv_opts.flags); + return 0; + } return flight_size; } @@ -481,14 +513,17 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } -u32 tcp_rcv_wnd_available (tcp_connection_t * tc); +i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); void tcp_retransmit_first_unacked (tcp_connection_t * tc); +void tcp_fast_retransmit_no_sack (tcp_connection_t * tc); +void tcp_fast_retransmit_sack (tcp_connection_t * tc); void tcp_fast_retransmit (tcp_connection_t * tc); -void tcp_cc_congestion (tcp_connection_t * tc); -void tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_init_congestion (tcp_connection_t * tc); +int tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_fastrecovery_exit (tcp_connection_t * tc); /* Made public for unit testing only */ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); @@ -563,16 +598,16 @@ tcp_retransmit_timer_set (tcp_connection_t * tc) } always_inline void -tcp_retransmit_timer_update (tcp_connection_t * tc) +tcp_retransmit_timer_reset (tcp_connection_t * tc) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, - clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); } always_inline void -tcp_retransmit_timer_reset (tcp_connection_t * tc) +tcp_retransmit_timer_force_update (tcp_connection_t * tc) { - tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); } always_inline void @@ -598,15 +633,43 @@ tcp_persist_timer_reset (tcp_connection_t * tc) tcp_timer_reset (tc, TCP_TIMER_PERSIST); } +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + if (tc->snd_una == tc->snd_una_max) + { + tcp_retransmit_timer_reset (tc); + if (tc->snd_wnd < tc->snd_mss) + tcp_persist_timer_set (tc); + } + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; } +#define tcp_validate_txf_size(_tc, _a) \ + ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ + || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a) + void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); +void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb); +sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb, + u32 prev_index, u32 start, + u32 end); +sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * + start, u8 have_sent_1_smss, + u8 * can_rescue, + u8 * snd_limited); +void scoreboard_init_high_rxt (sack_scoreboard_t * sb); always_inline sack_scoreboard_hole_t * scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) @@ -624,6 +687,14 @@ scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) return 0; } +always_inline sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + always_inline sack_scoreboard_hole_t * scoreboard_first_hole (sack_scoreboard_t * sb) { @@ -643,15 +714,19 @@ scoreboard_last_hole (sack_scoreboard_t * sb) always_inline void scoreboard_clear (sack_scoreboard_t * sb) { - sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb); + sack_scoreboard_hole_t *hole; while ((hole = scoreboard_first_hole (sb))) { scoreboard_remove_hole (sb, hole); } sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; sb->snd_una_adv = 0; - sb->max_byte_sacked = 0; + sb->high_sacked = 0; + sb->high_rxt = 0; + sb->lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } always_inline u32 @@ -671,6 +746,7 @@ scoreboard_init (sack_scoreboard_t * sb) { sb->head = TCP_INVALID_SACK_HOLE_INDEX; sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index b4497a3b..3a16cf63 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -393,7 +393,7 @@ typedef enum _tcp_dbg_evt DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _seq - _tc->irs; \ ed->data[1] = _end - _tc->irs; \ - ed->data[2] = _tc->opt.tsval; \ + ed->data[2] = _tc->rcv_opts.tsval; \ ed->data[3] = _tc->tsval_recent; \ } @@ -427,27 +427,27 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "rtx: snd_nxt %u offset %u snd %u rtx %u", \ + .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ .format_args = "i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->snd_nxt - _tc->iss; \ ed->data[1] = offset; \ ed->data[2] = n_bytes; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u", \ + .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \ .format_args = "t4i4i4i4", \ .n_enum_strings = 5, \ .enum_strings = { \ - "fast-rtx", \ - "rtx-timeout", \ - "first-rtx", \ + "fast-rxt", \ + "rxt-timeout", \ + "first-rxt", \ "recovered", \ "congestion", \ }, \ @@ -456,7 +456,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _sub_evt; \ ed->data[1] = tcp_available_snd_space (_tc); \ ed->data[2] = _tc->snd_congestion - _tc->iss; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 35bc9094..ff2229b3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -231,8 +231,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent - && timestamp_lt (tc->opt.tsval, tc->tsval_recent); + return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent + && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent); } /** @@ -248,10 +248,10 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) * then the TSval from the segment is copied to TS.Recent; * otherwise, the TSval is ignored. */ - if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { - tc->tsval_recent = tc->opt.tsval; + tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } } @@ -272,14 +272,21 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) return -1; - if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt))) + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) { return -1; } if (tcp_segment_check_paws (tc0)) { - clib_warning ("paws failed"); + if (CLIB_DEBUG > 2) + { + clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); + clib_warning ("seq %u seq_end %u ack %u", + vnet_buffer (b0)->tcp.seq_number - tc0->irs, + vnet_buffer (b0)->tcp.seq_end - tc0->irs, + vnet_buffer (b0)->tcp.ack_number - tc0->iss); + } TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); @@ -348,7 +355,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* If segment in window, save timestamp */ tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); - return 0; } @@ -391,6 +397,12 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) } } +void +tcp_update_rto (tcp_connection_t * tc) +{ + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); +} + /** Update RTT estimate and RTO timer * * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK @@ -405,7 +417,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) u32 mrtt = 0; u8 rtx_acked; - /* Determine if only rtx bytes are acked. TODO fast retransmit */ + /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */ rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); /* Karn's rule, part 1. Don't use retransmitted segments to estimate @@ -418,9 +430,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't * try to update rtt for dupacks */ - else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr + && tc->bytes_acked) { - mrtt = tcp_time_now () - tc->opt.tsecr; + mrtt = tcp_time_now () - tc->rcv_opts.tsecr; } /* Allow measuring of a new RTT */ @@ -436,7 +449,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tcp_update_rto (tc); return 0; } @@ -447,25 +460,46 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) static void tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) { - /* Dequeue the newly ACKed bytes */ - stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + /* Dequeue the newly ACKed add SACKed bytes */ + stream_session_dequeue_drop (&tc->connection, + tc->bytes_acked + tc->sack_sb.snd_una_adv); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); /* Update rtt and rto */ tcp_update_rtt (tc, ack); + + /* If everything has been acked, stop retransmit timer + * otherwise update. */ + tcp_retransmit_timer_update (tc); } /** - * Check if dupack as per RFC5681 Sec. 2 - * - * This works only if called before updating snd_wnd. - * */ -always_inline u8 -tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) + * Check if duplicate ack as per RFC5681 Sec. 2 + */ +static u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, + u32 prev_snd_una) { - return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una) && seq_gt (tc->snd_una_max, tc->snd_una) && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) - && (new_snd_wnd == tc->snd_wnd)); + && (prev_snd_wnd == tc->snd_wnd)); +} + +/** + * Checks if ack is a congestion control event. + */ +static u8 +tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, + u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack) +{ + /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are + * defined to be 'duplicate' */ + *is_dack = tc->sack_sb.last_sacked_bytes + || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); + + return (*is_dack || tcp_in_cong_recovery (tc)); } void @@ -478,6 +512,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) next = pool_elt_at_index (sb->holes, hole->next); next->prev = hole->prev; } + else + { + sb->tail = hole->prev; + } if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) { @@ -489,6 +527,9 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) sb->head = hole->next; } + if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + pool_put (sb->holes, hole); } @@ -527,26 +568,131 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, return hole; } +void +scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole, *prev; + u32 bytes = 0, blks = 0; + + sb->lost_bytes = 0; + hole = scoreboard_last_hole (sb); + if (!hole) + return; + + if (seq_gt (sb->high_sacked, hole->end)) + { + bytes = sb->high_sacked - hole->end; + blks = 1; + } + + while ((prev = scoreboard_prev_hole (sb, hole)) + && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD)) + { + bytes += hole->start - prev->end; + blks++; + hole = prev; + } + + hole = prev; + while (hole) + { + sb->lost_bytes += scoreboard_hole_bytes (hole); + hole->is_lost = 1; + hole = scoreboard_prev_hole (sb, hole); + } +} + +/** + * Figure out the next hole to retransmit + * + * Follows logic proposed in RFC6675 Sec. 4, NextSeg() + */ +sack_scoreboard_hole_t * +scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * start, + u8 have_sent_1_smss, + u8 * can_rescue, u8 * snd_limited) +{ + sack_scoreboard_hole_t *hole = 0; + + hole = start ? start : scoreboard_first_hole (sb); + while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) + hole = scoreboard_next_hole (sb, hole); + + /* Nothing, return */ + if (!hole) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + + /* Rule (1): if higher than rxt, less than high_sacked and lost */ + if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) + { + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + else + { + /* Rule (2): output takes care of transmitting new data */ + if (!have_sent_1_smss) + { + hole = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + } + /* Rule (3): if hole not lost */ + else if (seq_lt (hole->start, sb->high_sacked)) + { + *snd_limited = 1; + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + /* Rule (4): if hole beyond high_sacked */ + else + { + ASSERT (seq_geq (hole->start, sb->high_sacked)); + *snd_limited = 1; + *can_rescue = 1; + /* HighRxt MUST NOT be updated */ + return 0; + } + } + + if (hole && seq_lt (sb->high_rxt, hole->start)) + sb->high_rxt = hole->start; + + return hole; +} + +void +scoreboard_init_high_rxt (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (sb); + sb->high_rxt = hole->start; + sb->cur_rxt_hole = sb->head; +} + void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; - u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index; + u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; sb->last_sacked_bytes = 0; sb->snd_una_adv = 0; old_sacked_bytes = sb->sacked_bytes; - delivered_bytes = 0; + sb->last_bytes_delivered = 0; - if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + if (!tcp_opts_sack (&tc->rcv_opts) + && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; /* Remove invalid blocks */ - blk = tc->opt.sacks; - while (blk < vec_end (tc->opt.sacks)) + blk = tc->rcv_opts.sacks; + while (blk < vec_end (tc->rcv_opts.sacks)) { if (seq_lt (blk->start, blk->end) && seq_gt (blk->start, tc->snd_una) @@ -555,7 +701,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) blk++; continue; } - vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); } /* Add block for cumulative ack */ @@ -563,20 +709,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { tmp.start = tc->snd_una; tmp.end = ack; - vec_add1 (tc->opt.sacks, tmp); + vec_add1 (tc->rcv_opts.sacks, tmp); } - if (vec_len (tc->opt.sacks) == 0) + if (vec_len (tc->rcv_opts.sacks) == 0) return; /* Make sure blocks are ordered */ - for (i = 0; i < vec_len (tc->opt.sacks); i++) - for (j = i + 1; j < vec_len (tc->opt.sacks); j++) - if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++) + for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++) + if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start)) { - tmp = tc->opt.sacks[i]; - tc->opt.sacks[i] = tc->opt.sacks[j]; - tc->opt.sacks[j] = tmp; + tmp = tc->rcv_opts.sacks[i]; + tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j]; + tc->rcv_opts.sacks[j] = tmp; } if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) @@ -585,25 +731,25 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, tc->snd_una, tc->snd_una_max); sb->tail = scoreboard_hole_index (sb, last_hole); - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; - sb->max_byte_sacked = tmp.end; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; + sb->high_sacked = tmp.end; } else { /* If we have holes but snd_una_max is beyond the last hole, update * last hole end */ - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->max_byte_sacked) + if (seq_gt (tc->snd_una_max, sb->high_sacked) && seq_gt (tc->snd_una_max, last_hole->end)) last_hole->end = tc->snd_una_max; } /* Walk the holes with the SACK blocks */ hole = pool_elt_at_index (sb->holes, sb->head); - while (hole && blk_index < vec_len (tc->opt.sacks)) + while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { - blk = &tc->opt.sacks[blk_index]; + blk = &tc->rcv_opts.sacks[blk_index]; if (seq_leq (blk->start, hole->start)) { @@ -617,9 +763,9 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { /* Bytes lost because snd_wnd left edge advances */ if (next_hole && seq_leq (next_hole->start, ack)) - delivered_bytes += next_hole->start - hole->end; + sb->last_bytes_delivered += next_hole->start - hole->end; else - delivered_bytes += ack - hole->end; + sb->last_bytes_delivered += ack - hole->end; } else { @@ -633,8 +779,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_last_hole (sb); /* keep track of max byte sacked for when the last hole * is acked */ - if (seq_gt (hole->end, sb->max_byte_sacked)) - sb->max_byte_sacked = hole->end; + if (seq_gt (hole->end, sb->high_sacked)) + sb->high_sacked = hole->end; } /* snd_una needs to be advanced */ @@ -645,12 +791,12 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->snd_una_adv = next_hole->start - ack; /* all these can be delivered */ - delivered_bytes += sb->snd_una_adv; + sb->last_bytes_delivered += sb->snd_una_adv; } else if (!next_hole) { - sb->snd_una_adv = sb->max_byte_sacked - ack; - delivered_bytes += sb->snd_una_adv; + sb->snd_una_adv = sb->high_sacked - ack; + sb->last_bytes_delivered += sb->snd_una_adv; } } @@ -691,28 +837,33 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } blk_index++; - hole = scoreboard_next_hole (sb, hole); } - else + else if (seq_leq (blk->start, hole->end)) { sb->sacked_bytes += hole->end - blk->start; hole->end = blk->start; - hole = scoreboard_next_hole (sb, hole); } + + hole = scoreboard_next_hole (sb, hole); } } sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; - sb->sacked_bytes -= delivered_bytes; + sb->sacked_bytes -= sb->last_bytes_delivered; + scoreboard_update_lost (tc, sb); } -/** Update snd_wnd +/** + * Try to update snd_wnd based on feedback received from peer. * - * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set - * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ + * If successful, and new window is 'effectively' 0, activate persist + * timer. + */ static void tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) { + /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ if (seq_lt (tc->snd_wl1, seq) || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) { @@ -721,138 +872,269 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); - /* Set probe timer if we just got 0 wnd */ if (tc->snd_wnd < tc->snd_mss) { - if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + /* Set persist timer if not set and we just got 0 wnd */ + if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST) + && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)) tcp_persist_timer_set (tc); } else - tcp_persist_timer_reset (tc); + { + tcp_persist_timer_reset (tc); + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + } } } void -tcp_cc_congestion (tcp_connection_t * tc) +tcp_cc_init_congestion (tcp_connection_t * tc) { - tc->snd_congestion = tc->snd_nxt; + tcp_fastrecovery_on (tc); + tc->snd_congestion = tc->snd_una_max; tc->cc_algo->congestion (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } -void -tcp_cc_recover (tcp_connection_t * tc) +static void +tcp_cc_recovery_exit (tcp_connection_t * tc) { - /* TODO: check if time to recover was small. It might be that RTO popped - * too soon. - */ + /* Deflate rto */ + tcp_update_rto (tc); + tc->rto_boff = 0; + tc->snd_rxt_ts = 0; + tcp_recovery_off (tc); +} +void +tcp_cc_fastrecovery_exit (tcp_connection_t * tc) +{ tc->cc_algo->recovered (tc); + tc->snd_rxt_bytes = 0; + tc->rcv_dupacks = 0; + tcp_fastrecovery_off (tc); + tcp_fastrecovery_1_smss_off (tc); +} - tc->rtx_bytes = 0; +static void +tcp_cc_congestion_undo (tcp_connection_t * tc) +{ + tc->cwnd = tc->prev_cwnd; + tc->ssthresh = tc->prev_ssthresh; + tc->snd_nxt = tc->snd_una_max; tc->rcv_dupacks = 0; - tc->snd_nxt = tc->snd_una; + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + ASSERT (tc->rto_boff == 0); + /* TODO extend for fastrecovery */ +} - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tc->snd_rxt_ts + && tcp_opts_tstamp (&tc->rcv_opts) + && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); +} - tcp_cong_recovery_off (tc); +int +tcp_cc_recover (tcp_connection_t * tc) +{ + ASSERT (tcp_in_cong_recovery (tc)); + if (tcp_cc_is_spurious_retransmit (tc)) + { + tcp_cc_congestion_undo (tc); + return 1; + } + + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + ASSERT (tc->rto_boff == 0); + ASSERT (!tcp_in_cong_recovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); + return 0; } static void -tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) +tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) +{ + ASSERT (!tcp_in_cong_recovery (tc)); + + /* Congestion avoidance */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + + /* If a cumulative ack, make sure dupacks is 0 */ + tc->rcv_dupacks = 0; + + /* When dupacks hits the threshold we only enter fast retransmit if + * cumulative ack covers more than snd_congestion. Should snd_una + * wrap this test may fail under otherwise valid circumstances. + * Therefore, proactively update snd_congestion when wrap detected. */ + if (PREDICT_FALSE + (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked) + && seq_gt (tc->snd_congestion, tc->snd_una))) + tc->snd_congestion = tc->snd_una - 1; +} + +static u8 +tcp_should_fastrecover_sack (tcp_connection_t * tc) { - u8 partial_ack; - u32 bytes_advanced; + return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes; +} - if (tcp_in_fastrecovery (tc)) +static u8 +tcp_should_fastrecover (tcp_connection_t * tc) +{ + return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD + || tcp_should_fastrecover_sack (tc)); +} + +static void +tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) +{ + /* + * Duplicate ACK. Check if we should enter fast recovery, or if already in + * it account for the bytes that left the network. + */ + if (is_dack) { - partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); - if (!partial_ack) + ASSERT (tc->snd_una != tc->snd_una_max + || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; + + if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) { - /* Clear retransmitted bytes. */ - tcp_cc_recover (tc); + ASSERT (tcp_in_fastrecovery (tc)); + /* Pure duplicate ack. If some data got acked, it's handled lower */ + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } - else + else if (tcp_should_fastrecover (tc)) { - TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + /* Things are already bad */ + if (tcp_in_cong_recovery (tc)) + { + tc->rcv_dupacks = 0; + goto partial_ack_test; + } - /* Clear retransmitted bytes. XXX should we clear all? */ - tc->rtx_bytes = 0; + /* If of of the two conditions lower hold, reset dupacks + * 1) Cumulative ack does not cover more than congestion threshold + * 2) RFC6582 heuristic to avoid multiple fast retransmits + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_cc_init_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + /* The first segment MUST be retransmitted */ + tcp_retransmit_first_unacked (tc); - /* In case snd_nxt is still in the past and output tries to - * shove some new bytes */ - tc->snd_nxt = tc->snd_una_max; + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; - /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) + /* If cwnd allows, send more data */ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) { - tcp_fast_retransmit (tc); + scoreboard_init_high_rxt (&tc->sack_sb); + tcp_fast_retransmit_sack (tc); } else { - /* Retransmit first unacked segment */ - tcp_retransmit_first_unacked (tc); + tcp_fast_retransmit_no_sack (tc); } + + return; } - } - else - { - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; - tc->rcv_dupacks = 0; - if (tcp_in_recovery (tc)) + else if (!tc->bytes_acked + || (tc->bytes_acked && !tcp_in_cong_recovery (tc))) { - bytes_advanced = tc->bytes_acked + tc->sack_sb.snd_una_adv; - tc->rtx_bytes -= clib_min (bytes_advanced, tc->rtx_bytes); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - if (seq_geq (tc->snd_una, tc->snd_congestion)) - { - tc->rtx_bytes = 0; - tcp_recovery_off (tc); - } + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } + else + goto partial_ack; } -} -static void -tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) -{ -// ASSERT (seq_geq(tc->snd_una, ack)); +partial_ack_test: + + if (!tc->bytes_acked) + return; + +partial_ack: + /* + * Legitimate ACK. 1) See if we can exit recovery + */ + /* XXX limit this only to first partial ack? */ + tcp_retransmit_timer_update (tc); - tc->rcv_dupacks++; - if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + if (seq_geq (tc->snd_una, tc->snd_congestion)) { - /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ - if (tc->opt.tsecr != tc->tsecr_last_ack) - { - tc->rcv_dupacks = 0; - return; - } + /* If spurious return, we've already updated everything */ + if (tcp_cc_recover (tc)) + return; + + tc->snd_nxt = tc->snd_una_max; - tcp_fastrecovery_on (tc); + /* Treat as congestion avoidance ack */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + return; + } + + /* + * Legitimate ACK. 2) If PARTIAL ACK try to retransmit + */ + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0 */ + tc->rcv_dupacks = 0; - /* Handle congestion and dupack */ - tcp_cc_congestion (tc); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_retransmit_first_unacked (tc); - tcp_fast_retransmit (tc); + /* Post RTO timeout don't try anything fancy */ + if (tcp_in_recovery (tc)) + return; - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver */ - tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + /* Remove retransmitted bytes that have been delivered */ + if (tc->sack_sb.last_bytes_delivered + && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + { + /* If we have sacks and we haven't gotten an ack beyond high_rxt, + * remove sacked bytes delivered */ + tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered; } - else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + else { - ASSERT (tcp_in_fastrecovery (tc)); - - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + /* Either all retransmitted holes have been acked, or we're + * "in the blind" and retransmitting segment by segment */ + tc->snd_rxt_bytes = 0; } + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* + * Since this was a partial ack, try to retransmit some more data + */ + tcp_fast_retransmit (tc); } void @@ -862,14 +1144,18 @@ tcp_cc_init (tcp_connection_t * tc) tc->cc_algo->init (tc); } +/** + * Process incoming ACK + */ static int tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t * th, u32 * next, u32 * error) { - u32 new_snd_wnd; + u32 prev_snd_wnd, prev_snd_una; + u8 is_dack; /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ - if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { /* If we have outstanding data and this is within the window, accept it, * probably retransmit has timed out. Otherwise ACK segment and then @@ -892,7 +1178,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, } /* If old ACK, probably it's an old dupack */ - if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))) { *error = TCP_ERROR_ACK_OLD; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, @@ -900,54 +1186,50 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) { TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + tcp_cc_handle_event (tc, 1); } /* Don't drop yet */ return 0; } - if (tcp_opts_sack_permitted (&tc->opt)) - tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - - new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; - - if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) - { - TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); - *error = TCP_ERROR_ACK_DUP; - return -1; - } - /* - * Valid ACK + * Looks okay, process feedback */ - tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; - tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - /* Dequeue ACKed data and update RTT */ - tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + prev_snd_wnd = tc->snd_wnd; + prev_snd_una = tc->snd_una; tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, - vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + vnet_buffer (b)->tcp.ack_number, + clib_net_to_host_u16 (th->window) << tc->snd_wscale); + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + tcp_validate_txf_size (tc, tc->bytes_acked); - /* If some of our sent bytes have been acked, update cc and retransmit - * timer. */ if (tc->bytes_acked) - { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); - /* Updates congestion control (slow start/congestion avoidance) */ - tcp_cc_rcv_ack (tc, b); + /* + * Check if we have congestion event + */ - /* If everything has been acked, stop retransmit timer - * otherwise update. */ - if (tc->snd_una == tc->snd_una_max) - tcp_retransmit_timer_reset (tc); - else - tcp_retransmit_timer_update (tc); + if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) + { + tcp_cc_handle_event (tc, is_dack); + *error = TCP_ERROR_ACK_DUP; + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); + return vnet_buffer (b)->tcp.data_len ? 0 : -1; } + /* + * Update congestion control (slow start/congestion avoidance) + */ + tcp_cc_update (tc, b); + return 0; } @@ -1059,7 +1341,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, } /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { /* Remove SACK blocks that have been delivered */ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); @@ -1097,7 +1379,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); /* Update SACK list if in use */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { ooo_segment_t *newest; u32 start, end; @@ -1294,7 +1576,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_to_next; vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (n_left_from > 0 && n_left_to_next > 0) { u32 bi0; @@ -1321,7 +1602,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } th0 = tcp_buffer_hdr (b0); - is_fin = (th0->flags & TCP_FLAG_FIN) != 0; /* SYNs, FINs and data consume sequence numbers */ @@ -1387,7 +1667,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); - return from_frame->n_vectors; } @@ -1582,17 +1861,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->irs = seq0; /* Parse options */ - if (tcp_options_parse (tcp0, &new_tc0->opt)) + if (tcp_options_parse (tcp0, &new_tc0->rcv_opts)) goto drop; - if (tcp_opts_tstamp (&new_tc0->opt)) + if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { - new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; new_tc0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&new_tc0->opt)) - new_tc0->snd_wscale = new_tc0->opt.wscale; + if (tcp_opts_wscale (&new_tc0->rcv_opts)) + new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; /* No scaling */ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); @@ -1845,7 +2124,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Initialize session variables */ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) - << tc0->opt.wscale; + << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -1903,13 +2182,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, break; case TCP_STATE_LAST_ACK: - /* The only thing that can arrive in this state is an + /* The only thing that [should] arrive in this state is an * acknowledgment of our FIN. If our FIN is now acknowledged, * delete the TCB, enter the CLOSED state, and return. */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; + /* Apparently our FIN was lost */ + if (tcp_fin (tcp0)) + { + /* Don't "make" fin since that increments snd_nxt */ + tcp_send_fin (tc0); + goto drop; + } + tc0->state = TCP_STATE_CLOSED; /* Don't delete the connection/session yet. Instead, wait a @@ -1929,8 +2216,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * retransmission of the remote FIN. Acknowledge it, and restart * the 2 MSL timeout. */ - /* TODO */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + tcp_make_ack (tc0, b0); + tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + goto drop; + break; default: ASSERT (0); @@ -2194,7 +2488,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - if (tcp_options_parse (th0, &child0->opt)) + if (tcp_options_parse (th0, &child0->rcv_opts)) { goto drop; } @@ -2205,14 +2499,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} * segments are used to initialize PAWS. */ - if (tcp_opts_tstamp (&child0->opt)) + if (tcp_opts_tstamp (&child0->rcv_opts)) { - child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent = child0->rcv_opts.tsval; child0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&child0->opt)) - child0->snd_wscale = child0->opt.wscale; + if (tcp_opts_wscale (&child0->rcv_opts)) + child0->snd_wscale = child0->rcv_opts.wscale; /* No scaling */ child0->snd_wnd = clib_net_to_host_u16 (th0->window); @@ -2477,7 +2771,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_add_trace (vm, node, b0, sizeof (*t0)); tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); } @@ -2600,7 +2893,13 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 3525f4e5..c66250e4 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -51,9 +51,23 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) } else if (ack_type == TCP_CC_PARTIALACK) { - tc->cwnd -= tc->bytes_acked; - if (tc->bytes_acked > tc->snd_mss) - tc->bytes_acked += tc->snd_mss; + /* RFC 6582 Sec. 3.2 */ + if (!tcp_opts_sack_permitted (&tc->rcv_opts)) + { + /* Deflate the congestion window by the amount of new data + * acknowledged by the Cumulative Acknowledgment field. + * If the partial ACK acknowledges at least one SMSS of new data, + * then add back SMSS bytes to the congestion window. This + * artificially inflates the congestion window in order to reflect + * the additional segment that has left the network. This "partial + * window deflation" attempts to ensure that, when fast recovery + * eventually ends, approximately ssthresh amount of data will be + * outstanding in the network.*/ + tc->cwnd = (tc->cwnd > tc->bytes_acked) ? + tc->cwnd - tc->bytes_acked : 0; + if (tc->bytes_acked > tc->snd_mss) + tc->cwnd += tc->snd_mss; + } } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 49fd6bef..47c94e6d 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -136,10 +136,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = stream_session_max_rx_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - ASSERT (tc->opt.mss < max_fifo); - if (available_space < tc->opt.mss && available_space < max_fifo >> 3) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -276,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -293,14 +296,14 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -308,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += TCP_OPTION_LEN_SACK_PERMITTED; @@ -326,14 +329,14 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { @@ -395,7 +398,7 @@ tcp_update_snd_mss (tcp_connection_t * tc) tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); /* XXX check if MTU has been updated */ - tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; ASSERT (tc->snd_mss > 0); } @@ -406,21 +409,21 @@ tcp_init_mss (tcp_connection_t * tc) tcp_update_rcv_mss (tc); /* TODO cache mss and consider PMTU discovery */ - tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); if (tc->snd_mss < 45) { clib_warning ("snd mss is 0"); /* Assume that at least the min default mss works */ tc->snd_mss = default_min_mss; - tc->opt.mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; } /* We should have enough space for 40 bytes of options */ ASSERT (tc->snd_mss > 45); /* If we use timestamp option, account for it */ - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } @@ -879,6 +882,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -919,10 +923,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, if (compute_opts) tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Write pre-computed options */ tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); - - /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -930,26 +931,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); ASSERT (opts_write_len == tc->snd_opts_len); - - /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; tc->rcv_las = tc->rcv_nxt; /* TODO this is updated in output as well ... */ - if (tc->snd_nxt > tc->snd_una_max) - tc->snd_una_max = tc->snd_nxt; - - if (tc->rtt_ts == 0) + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) { - tc->rtt_ts = tcp_time_now (); - tc->rtt_seq = tc->snd_nxt; + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); } + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -987,13 +987,14 @@ tcp_timer_delack_handler (u32 index) * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, u32 offset, u32 max_bytes) { vlib_main_t *vm = vlib_get_main (); - u32 n_bytes = 0; + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); @@ -1001,15 +1002,16 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (max_bytes != 0); max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; /* Start is beyond snd_congestion */ - if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + if (seq_geq (start, tc->snd_congestion)) goto done; /* Don't overshoot snd_congestion */ - if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - tc->snd_nxt; + max_bytes = tc->snd_congestion - start; if (max_bytes == 0) goto done; } @@ -1021,15 +1023,12 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state, 0); - /* Don't count multiple retransmits of the same segment */ - if (tc->rto_boff > 1) - goto done; - - tc->rtx_bytes += n_bytes; + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -1042,18 +1041,15 @@ done: static void tcp_rtx_timeout_cc (tcp_connection_t * tc) { + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - { - tcp_cc_recover (tc); - } - else - { - tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); - } + tcp_cc_fastrecovery_exit (tc); /* Start again from the beginning */ - + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; tcp_recovery_on (tc); @@ -1081,18 +1077,31 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); @@ -1102,24 +1111,30 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - /* Send one segment. No fancy recovery for now! */ + /* Send one segment */ n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); if (n_bytes == 0) { clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); + /* Try again eventually */ tcp_retransmit_timer_set (tc); + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } + + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); - /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) @@ -1132,6 +1147,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; + } if (!is_syn) { @@ -1180,7 +1201,8 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, old_snd_nxt; + int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1202,13 +1224,15 @@ tcp_timer_persist_handler (u32 index) /* Try to force the first unsent segment */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, tc->snd_mss); /* Nothing to send */ - if (n_bytes == 0) + if (n_bytes <= 0) { clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); @@ -1216,7 +1240,13 @@ tcp_timer_persist_handler (u32 index) } b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable persist timer */ @@ -1232,8 +1262,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ @@ -1244,75 +1275,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - goto done; + { + tcp_return_buffer (tm); + goto done; + } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); done: - tc->snd_nxt = tc->snd_una_max; + tc->snd_nxt = old_snd_nxt; } -sack_scoreboard_hole_t * -scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) { - sack_scoreboard_hole_t *hole = 0; - -// hole = scoreboard_first_hole (&tc->sack_sb); -// if (hole) -// { -// -// offset = hole->start - tc->snd_una; -// hole_size = hole->end - hole->start; -// -// ASSERT(hole_size); -// -// if (hole_size < max_bytes) -// max_bytes = hole_size; -// } - return hole; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; + vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; } /** - * Do fast retransmit. + * Fast retransmit without SACK info */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 bi; + u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; - u32 n_written = 0, offset = 0; vlib_buffer_t *b; - u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* If we have SACKs use them */ - if (tcp_opts_sack_permitted (&tc->opt) - && scoreboard_first_hole (&tc->sack_sb)) - use_sacks = 0; while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (use_sacks) - { - scoreboard_first_rtx_hole (&tc->sack_sb); - } - else - { - offset += n_written; - } - + offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ @@ -1326,9 +1399,21 @@ tcp_fast_retransmit (tcp_connection_t * tc) snd_space -= n_written; } - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1544,6 +1629,12 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 2af38484..3f8afa40 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -54,7 +54,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_una = 0; tc->snd_una_max = 1000; tc->snd_nxt = 1000; - tc->opt.flags |= TCP_OPTS_FLAG_SACK; + tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; scoreboard_init (&tc->sack_sb); for (i = 0; i < 1000 / 100; i++) @@ -70,9 +70,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -93,18 +93,17 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->last_sacked_bytes == 400), "last sacked bytes %d", sb->last_sacked_bytes); - TCP_TEST ((sb->max_byte_sacked == 900), - "max byte sacked %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked); /* * Inject odd blocks */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -118,8 +117,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->last_sacked_bytes == 500), "last sacked bytes %d", sb->last_sacked_bytes); @@ -135,8 +133,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); TCP_TEST ((sb->snd_una_adv == 900), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); @@ -145,11 +142,11 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Add new block */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); block.start = 1200; block.end = 1300; - vec_add1 (tc->opt.sacks, block); + vec_add1 (tc->rcv_opts.sacks, block); if (verbose) vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); @@ -171,8 +168,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1300), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked); hole = scoreboard_last_hole (sb); TCP_TEST ((hole->start == 1300 && hole->end == 1500), "last hole start %u end %u", hole->start, hole->end); @@ -182,7 +178,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Ack first hole */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 1200); if (verbose) @@ -196,8 +192,16 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); /* - * Remove all + * Add some more blocks and then remove all */ + vec_reset_length (tc->rcv_opts.sacks); + for (i = 0; i < 5; i++) + { + block.start = i * 100 + 1200; + block.end = (i + 1) * 100 + 1200; + vec_add1 (tc->rcv_opts.sacks, block); + } + tcp_rcv_sacks (tc, 1900); scoreboard_clear (sb); if (verbose) @@ -205,6 +209,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((pool_elts (sb->holes) == 0), "number of holes %d", pool_elts (sb->holes)); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); + /* * Re-inject odd blocks and ack them all */ @@ -214,9 +221,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_nxt = 1000; for (i = 0; i < 5; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", @@ -740,6 +747,10 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); } + /* Try to peek beyond the data */ + rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf); + TCP_TEST ((rv == 0), "peeked %u expected 0", rv); + vec_free (data_buf); svm_fifo_free (f); vec_free (test_data); @@ -1239,7 +1250,7 @@ tcp_test_session (vlib_main_t * vm, unformat_input_t * input) tc0->c_thread_index = 0; tc0->c_lcl_ip4.as_u32 = local.as_u32; tc0->c_rmt_ip4.as_u32 = remote.as_u32; - tc0->opt.mss = 1450; + tc0->rcv_opts.mss = 1450; tcp_connection_init_vars (tc0); TCP_EVT_DBG (TCP_EVT_OPEN, tc0); -- cgit 1.2.3-korg From f03a59ab008908f98fd7d1b187a8c0fb78b01add Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 9 Jun 2017 21:07:32 -0700 Subject: Overall tcp performance improvements (VPP-846) - limit minimum rto per connection - cleanup sack scoreboard - switched svm fifo out-of-order data handling from absolute offsets to relative offsets. - improve cwnd handling when using sacks - add cc event debug stats - improved uri tcp test client/server: bugfixes and added half-duplex mode - expanded builtin client/server - updated uri socket client/server code to work in half-duplex - ensure session node unsets fifo event for empty fifo - fix session detach Change-Id: Ia446972340e32a65e0694ee2844355167d0c170d Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 152 +++++++++-------- src/svm/svm_fifo.h | 26 ++- src/svm/svm_fifo_segment.c | 7 +- src/uri/uri_socket_server.c | 25 ++- src/uri/uri_socket_test.c | 93 +++++++---- src/uri/uri_tcp_test.c | 120 ++++++++++---- src/vnet/session/application.c | 2 +- src/vnet/session/node.c | 7 +- src/vnet/session/segment_manager.c | 5 + src/vnet/session/session.c | 25 ++- src/vnet/session/session.h | 4 +- src/vnet/session/session_api.c | 2 +- src/vnet/session/transport.h | 2 + src/vnet/tcp/builtin_client.c | 325 +++++++++++++++++++------------------ src/vnet/tcp/builtin_client.h | 107 +++++------- src/vnet/tcp/builtin_server.c | 114 ++++++++++--- src/vnet/tcp/tcp.c | 22 ++- src/vnet/tcp/tcp.h | 12 +- src/vnet/tcp/tcp_debug.h | 186 +++++++++++++++------ src/vnet/tcp/tcp_input.c | 193 ++++++++++++---------- src/vnet/tcp/tcp_newreno.c | 4 +- src/vnet/tcp/tcp_output.c | 10 +- src/vnet/tcp/tcp_test.c | 70 ++++++-- 23 files changed, 945 insertions(+), 568 deletions(-) (limited to 'src/vnet/tcp/tcp_newreno.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 5c8f244a..6ca437cf 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -15,10 +15,39 @@ #include -#define offset_lt(_a, _b) ((i32)((_a)-(_b)) < 0) -#define offset_leq(_a, _b) ((i32)((_a)-(_b)) <= 0) -#define offset_gt(_a, _b) ((i32)((_a)-(_b)) > 0) -#define offset_geq(_a, _b) ((i32)((_a)-(_b)) >= 0) +static inline u8 +position_lt (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + < ooo_segment_distance_to_tail (f, b)); +} + +static inline u8 +position_leq (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + <= ooo_segment_distance_to_tail (f, b)); +} + +static inline u8 +position_gt (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + > ooo_segment_distance_to_tail (f, b)); +} + +static inline u32 +position_diff (svm_fifo_t * f, u32 posa, u32 posb) +{ + return ooo_segment_distance_to_tail (f, posa) + - ooo_segment_distance_to_tail (f, posb); +} + +static inline u32 +ooo_segment_end_pos (svm_fifo_t * f, ooo_segment_t * s) +{ + return (s->start + s->length) % f->nitems; +} u8 * format_ooo_segment (u8 * s, va_list * args) @@ -145,13 +174,17 @@ static void ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { ooo_segment_t *s, *new_s, *prev, *next, *it; - u32 new_index, end_offset, s_sof, s_eof, s_index; + u32 new_index, s_end_pos, s_index; + u32 normalized_position, normalized_end_position; + + normalized_position = (f->tail + offset) % f->nitems; + normalized_end_position = (f->tail + offset + length) % f->nitems; - end_offset = offset + length; + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; if (f->ooos_list_head == OOO_SEGMENT_INVALID_INDEX) { - s = ooo_segment_new (f, offset, length); + s = ooo_segment_new (f, normalized_position, length); f->ooos_list_head = s - f->ooo_segments; f->ooos_newest = f->ooos_list_head; return; @@ -160,28 +193,26 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) /* Find first segment that starts after new segment */ s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); while (s->next != OOO_SEGMENT_INVALID_INDEX - && offset_leq (ooo_segment_offset (f, s), offset)) + && position_lt (f, s->start, normalized_position)) s = pool_elt_at_index (f->ooo_segments, s->next); /* If we have a previous and we overlap it, use it as starting point */ prev = ooo_segment_get_prev (f, s); - if (prev && offset_leq (offset, ooo_segment_end_offset (f, prev))) + if (prev + && position_leq (f, normalized_position, ooo_segment_end_pos (f, prev))) { s = prev; - prev = ooo_segment_get_prev (f, s); - s_sof = ooo_segment_offset (f, s); - s_eof = ooo_segment_end_offset (f, s); + s_end_pos = ooo_segment_end_pos (f, s); goto merge; } s_index = s - f->ooo_segments; - s_sof = ooo_segment_offset (f, s); - s_eof = ooo_segment_end_offset (f, s); + s_end_pos = ooo_segment_end_pos (f, s); /* No overlap, add before current segment */ - if (offset_lt (end_offset, s_sof)) + if (position_lt (f, normalized_end_position, s->start)) { - new_s = ooo_segment_new (f, offset, length); + new_s = ooo_segment_new (f, normalized_position, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ @@ -198,28 +229,23 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) f->ooos_list_head = new_index; } - new_s->next = s - f->ooo_segments; + new_s->next = s_index; s->prev = new_index; f->ooos_newest = new_index; return; } /* No overlap, add after current segment */ - else if (offset_gt (offset, s_eof)) + else if (position_gt (f, normalized_position, s_end_pos)) { - new_s = ooo_segment_new (f, offset, length); + new_s = ooo_segment_new (f, normalized_position, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ s = pool_elt_at_index (f->ooo_segments, s_index); - if (s->next != OOO_SEGMENT_INVALID_INDEX) - { - new_s->next = s->next; - next = pool_elt_at_index (f->ooo_segments, new_s->next); - next->prev = new_index; - } + ASSERT (s->next == OOO_SEGMENT_INVALID_INDEX); - new_s->prev = s - f->ooo_segments; + new_s->prev = s_index; s->next = new_index; f->ooos_newest = new_index; @@ -233,30 +259,32 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) merge: /* Merge at head */ - if (offset_lt (offset, s_sof)) + if (position_lt (f, normalized_position, s->start)) { - s->start = offset; - s->length = s_eof - ooo_segment_offset (f, s); + s->start = normalized_position; + s->length = position_diff (f, s_end_pos, s->start); } - /* Last but overlapping previous */ - else if (offset_gt (end_offset, s_eof)) + /* Overlapping tail */ + else if (position_gt (f, normalized_end_position, s_end_pos)) { - s->length = end_offset - ooo_segment_offset (f, s); + s->length = position_diff (f, normalized_end_position, s->start); } /* New segment completely covered by current one */ else { /* Do Nothing */ + s = 0; goto done; } /* The new segment's tail may cover multiple smaller ones */ - if (offset_geq (end_offset, s_eof)) + if (position_gt (f, normalized_end_position, s_end_pos)) { /* Remove the completely overlapped segments */ it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? pool_elt_at_index (f->ooo_segments, s->next) : 0; - while (it && offset_leq (ooo_segment_end_offset (f, it), end_offset)) + while (it && position_leq (f, ooo_segment_end_pos (f, it), + normalized_end_position)) { next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? pool_elt_at_index (f->ooo_segments, it->next) : 0; @@ -265,17 +293,17 @@ merge: } /* If partial overlap with last, merge */ - if (it && offset_leq (ooo_segment_offset (f, it), end_offset)) + if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = ooo_segment_end_offset (f, it) - - ooo_segment_offset (f, s); + s->length = ooo_segment_end_pos (f, it) - s->start; ooo_segment_del (f, it - f->ooo_segments); } } done: /* Most recently updated segment */ - f->ooos_newest = s - f->ooo_segments; + if (s) + f->ooos_newest = s - f->ooo_segments; } /** @@ -286,32 +314,28 @@ static int ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { ooo_segment_t *s; - u32 index, bytes = 0, diff; - u32 cursize, norm_start, nitems; - - /* current size has not yet been updated */ - cursize = svm_fifo_max_dequeue (f) + n_bytes_enqueued; - nitems = f->nitems; + u32 index, bytes = 0; + i32 diff; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); - norm_start = s->start % nitems; - diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; + diff = (f->tail >= s->start) ? + f->tail - s->start : f->nitems + f->tail - s->start; - if (diff > cursize) + if (diff > n_bytes_enqueued) return 0; /* If last tail update overlaps one/multiple ooo segments, remove them */ - while (0 < diff && diff < cursize) + while (0 <= diff && diff < n_bytes_enqueued) { index = s - f->ooo_segments; /* Segment end is beyond the tail. Advance tail and remove segment */ - if (diff < s->length) + if (s->length > diff) { - f->tail += s->length - diff; - f->tail %= f->nitems; bytes = s->length - diff; + f->tail += bytes; + f->tail %= f->nitems; ooo_segment_del (f, index); break; } @@ -320,8 +344,8 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - norm_start = s->start % nitems; - diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; + diff = (f->tail >= s->start) ? + f->tail - s->start : f->nitems + f->tail - s->start; ooo_segment_del (f, index); } /* End of search */ @@ -332,18 +356,6 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } - /* If tail is adjacent to an ooo segment, 'consume' it */ - if (diff == 0) - { - bytes = ((nitems - cursize) >= s->length) ? s->length : - nitems - cursize; - - f->tail += bytes; - f->tail %= nitems; - - ooo_segment_del (f, s - f->ooo_segments); - } - return bytes; } @@ -355,6 +367,7 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; if (PREDICT_FALSE (cursize == f->nitems)) return -2; /* fifo stuffed */ @@ -424,13 +437,16 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, u8 * copy_from_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; - u32 cursize, nitems; - u32 normalized_offset, offset_from_tail; + u32 cursize, nitems, normalized_offset; + u32 offset_from_tail; + + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; - normalized_offset = offset % nitems; + + normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? */ offset_from_tail = (nitems + normalized_offset - f->tail) % nitems; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 9cb93ff4..f32ef41d 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -127,21 +127,37 @@ format_function_t format_svm_fifo; always_inline ooo_segment_t * svm_fifo_newest_ooo_segment (svm_fifo_t * f) { - return f->ooo_segments + f->ooos_newest; + if (f->ooos_newest == OOO_SEGMENT_INVALID_INDEX) + return 0; + return pool_elt_at_index (f->ooo_segments, f->ooos_newest); +} + +always_inline u32 +ooo_segment_distance_to_tail (svm_fifo_t * f, u32 a) +{ + /* Ambiguous. Assumption is that ooo segments don't touch tail */ + if (a == f->tail && f->tail == f->head) + return f->nitems; + + return ((f->nitems + a - f->tail) % f->nitems); } always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { -// return ((f->nitems + s->fifo_position - f->tail) % f->nitems); - return s->start; + return ooo_segment_distance_to_tail (f, s->start); } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { -// return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); - return s->start + s->length; + return ooo_segment_distance_to_tail (f, s->start) + s->length; +} + +always_inline u32 +ooo_segment_length (svm_fifo_t * f, ooo_segment_t * s) +{ + return s->length; } always_inline ooo_segment_t * diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index eef2168c..c4ac2352 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -305,14 +305,17 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, /* Remove from active list */ if (f->prev) f->prev->next = f->next; + else + fsh->fifos = f->next; if (f->next) f->next->prev = f->prev; - /* FALLTHROUGH */ + /* Fall through: we add only rx fifos to active pool */ case FIFO_SEGMENT_TX_FREELIST: /* Add to free list */ f->next = fsh->free_fifos[list_index]; + f->prev = 0; fsh->free_fifos[list_index] = f; - /* FALLTHROUGH */ + break; case FIFO_SEGMENT_FREELIST_NONE: break; diff --git a/src/uri/uri_socket_server.c b/src/uri/uri_socket_server.c index 2366f420..4f4c5f30 100644 --- a/src/uri/uri_socket_server.c +++ b/src/uri/uri_socket_server.c @@ -22,6 +22,7 @@ #include #include #include +#include volatile int signal_received; @@ -78,7 +79,10 @@ main (int argc, char *argv[]) struct sockaddr_in serv_addr; struct sockaddr_in client; struct hostent *server; - u8 *rx_buffer = 0; + u8 *rx_buffer = 0, no_echo = 0; + struct timeval start, end; + long rcvd = 0; + double deltat; if (argc > 1 && argc < 3) { @@ -86,8 +90,9 @@ main (int argc, char *argv[]) exit (0); } - if (argc >= 3) + if (argc >= 4) { + no_echo = atoi (argv[3]); portno = atoi (argv[2]); server = gethostbyname (argv[1]); if (server == NULL) @@ -137,7 +142,7 @@ main (int argc, char *argv[]) exit (1); } - vec_validate (rx_buffer, 8999 /* jumbo mtu */ ); + vec_validate (rx_buffer, 128 << 10); if (listen (sockfd, 5 /* backlog */ ) < 0) { @@ -160,6 +165,8 @@ main (int argc, char *argv[]) } fformat (stderr, "Accepted connection from: %s : %d\n", inet_ntoa (client.sin_addr), client.sin_port); + gettimeofday (&start, NULL); + while (1) { n = recv (accfd, rx_buffer, vec_len (rx_buffer), 0 /* flags */ ); @@ -167,6 +174,14 @@ main (int argc, char *argv[]) { /* Graceful exit */ close (accfd); + gettimeofday (&end, NULL); + deltat = (end.tv_sec - start.tv_sec); + deltat += (end.tv_usec - start.tv_usec) / 1000000.0; + clib_warning ("Finished in %.6f", deltat); + clib_warning ("%.4f Gbit/second %s", + (((f64) rcvd * 8.0) / deltat / 1e9), + no_echo ? "half" : "full"); + rcvd = 0; break; } if (n < 0) @@ -179,6 +194,10 @@ main (int argc, char *argv[]) if (signal_received) break; + rcvd += n; + if (no_echo) + continue; + sent = send (accfd, rx_buffer, n, 0 /* flags */ ); if (n < 0) { diff --git a/src/uri/uri_socket_test.c b/src/uri/uri_socket_test.c index 9f049bda..5f7084d5 100644 --- a/src/uri/uri_socket_test.c +++ b/src/uri/uri_socket_test.c @@ -19,6 +19,7 @@ #include #include #include +#include int main (int argc, char *argv[]) @@ -26,28 +27,44 @@ main (int argc, char *argv[]) int sockfd, portno, n; struct sockaddr_in serv_addr; struct hostent *server; - u8 *rx_buffer = 0, *tx_buffer = 0; + u8 *rx_buffer = 0, *tx_buffer = 0, no_echo = 0, test_bytes = 0; u32 offset; - int iter, i; - if (0 && argc < 3) + long bytes = 1 << 20, to_send; + int i; + struct timeval start, end; + double deltat; + + if (argc >= 3) { - fformat (stderr, "usage %s hostname port\n", argv[0]); - exit (0); + bytes = ((long) atoi (argv[4])) << 20; + no_echo = atoi (argv[3]); + portno = atoi (argv[2]); + server = gethostbyname (argv[1]); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + } + else + { + portno = 1234; // atoi(argv[2]); + server = gethostbyname ("6.0.1.1" /* argv[1] */ ); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } } - portno = 1234; // atoi(argv[2]); + to_send = bytes; sockfd = socket (AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { clib_unix_error ("socket"); exit (1); } - server = gethostbyname ("6.0.1.1" /* argv[1] */ ); - if (server == NULL) - { - clib_unix_warning ("gethostbyname"); - exit (1); - } + bzero ((char *) &serv_addr, sizeof (serv_addr)); serv_addr.sin_family = AF_INET; bcopy ((char *) server->h_addr, @@ -59,8 +76,8 @@ main (int argc, char *argv[]) exit (1); } - vec_validate (rx_buffer, 1400); - vec_validate (tx_buffer, 1400); + vec_validate (rx_buffer, 128 << 10); + vec_validate (tx_buffer, 128 << 10); for (i = 0; i < vec_len (tx_buffer); i++) tx_buffer[i] = (i + 1) % 0xff; @@ -75,19 +92,28 @@ main (int argc, char *argv[]) exit (0); } - for (iter = 0; iter < 100000; iter++) + gettimeofday (&start, NULL); + while (bytes > 0) { - if (iter < 99999) + /* + * TX + */ + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) { - n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); - if (n != vec_len (tx_buffer)) - { - clib_unix_warning ("write"); - exit (0); - } + clib_unix_warning ("write"); + exit (0); } - offset = 0; + bytes -= n; + if (no_echo) + continue; + + /* + * RX + */ + + offset = 0; do { n = recv (sockfd, rx_buffer + offset, @@ -101,18 +127,27 @@ main (int argc, char *argv[]) } while (offset < vec_len (rx_buffer)); - for (i = 0; i < vec_len (rx_buffer); i++) + if (test_bytes) { - if (rx_buffer[i] != tx_buffer[i]) + for (i = 0; i < vec_len (rx_buffer); i++) { - clib_warning ("[%d] read 0x%x not 0x%x", - rx_buffer[i], tx_buffer[i]); - exit (1); + if (rx_buffer[i] != tx_buffer[i]) + { + clib_warning ("[%d] read 0x%x not 0x%x", rx_buffer[i], + tx_buffer[i]); + exit (1); + } } } - } close (sockfd); + gettimeofday (&end, NULL); + + deltat = (end.tv_sec - start.tv_sec); + deltat += (end.tv_usec - start.tv_usec) / 1000000.0; // us to ms + clib_warning ("Finished in %.6f", deltat); + clib_warning ("%.4f Gbit/second %s", (((f64) to_send * 8.0) / deltat / 1e9), + no_echo ? "half" : "full"); return 0; } diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index e201a359..d1694cf4 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -46,6 +46,8 @@ typedef struct svm_fifo_t *server_tx_fifo; u64 vpp_session_handle; + u64 bytes_received; + f64 start; } session_t; typedef enum @@ -174,7 +176,7 @@ wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) if (utm->state == STATE_FAILED) return -1; if (utm->time_to_stop == 1) - return -1; + return 0; } clib_warning ("timeout waiting for STATE_READY"); return -1; @@ -184,7 +186,7 @@ void application_send_attach (uri_tcp_test_main_t * utm) { vl_api_application_attach_t *bmp; - u32 fifo_size = 3 << 20; + u32 fifo_size = 4 << 20; bmp = vl_msg_api_alloc (sizeof (*bmp)); memset (bmp, 0, sizeof (*bmp)); @@ -343,11 +345,23 @@ vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) mp->segment_size); } +static void +session_print_stats (uri_tcp_test_main_t * utm, session_t * session) +{ + f64 deltat; + u64 bytes; + + deltat = clib_time_now (&utm->clib_time) - session->start; + bytes = utm->i_am_master ? session->bytes_received : utm->bytes_to_send; + fformat (stdout, "Finished in %.6f\n", deltat); + fformat (stdout, "%.4f Gbit/second\n", (bytes * 8.0) / deltat / 1e9); +} + static void vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - session_t *session; + session_t *session = 0; vl_api_disconnect_session_reply_t *rmp; uword *p; int rv = 0; @@ -366,7 +380,7 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rv = -11; } - utm->time_to_stop = 1; +// utm->time_to_stop = 1; rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); @@ -375,6 +389,9 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rmp->retval = rv; rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); + + if (session) + session_print_stats (utm, session); } static void @@ -431,14 +448,19 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, if (n_read > 0) { bytes -= n_read; - for (i = 0; i < n_read; i++) + if (utm->test_return_packets) { - if (utm->rx_buf[i] != ((utm->client_bytes_received + i) & 0xff)) + for (i = 0; i < n_read; i++) { - clib_warning ("error at byte %lld, 0x%x not 0x%x", - utm->client_bytes_received + i, - utm->rx_buf[i], - ((utm->client_bytes_received + i) & 0xff)); + if (utm->rx_buf[i] + != ((utm->client_bytes_received + i) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + utm->client_bytes_received + i, + utm->rx_buf[i], + ((utm->client_bytes_received + + i) & 0xff)); + } } } utm->client_bytes_received += n_read; @@ -545,6 +567,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) session->server_rx_fifo = rx_fifo; session->server_tx_fifo = tx_fifo; session->vpp_session_handle = mp->handle; + session->start = clib_time_now (&utm->clib_time); /* Save handle */ utm->connected_session_index = session_index; @@ -571,7 +594,7 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, u64 bytes_sent = 0; int test_buf_offset = 0; u32 bytes_to_snd; - u32 queue_max_chunk = 64 << 10, actual_write; + u32 queue_max_chunk = 128 << 10, actual_write; session_fifo_event_t evt; static int serial_number = 0; int rv; @@ -582,8 +605,8 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, while (bytes_to_snd > 0) { - actual_write = - bytes_to_snd > queue_max_chunk ? queue_max_chunk : bytes_to_snd; + actual_write = (bytes_to_snd > queue_max_chunk) ? + queue_max_chunk : bytes_to_snd; rv = svm_fifo_enqueue_nowait (tx_fifo, actual_write, test_data + test_buf_offset); @@ -635,9 +658,9 @@ client_send_data (uri_tcp_test_main_t * utm) if (leftover) send_test_chunk (utm, tx_fifo, mypid, leftover); - if (utm->test_return_packets) + if (!utm->drop_packets) { - f64 timeout = clib_time_now (&utm->clib_time) + 2; + f64 timeout = clib_time_now (&utm->clib_time) + 10; /* Wait for the outstanding packets */ while (utm->client_bytes_received < @@ -698,6 +721,7 @@ int client_disconnect (uri_tcp_test_main_t * utm) { client_send_disconnect (utm); + clib_warning ("Sent disconnect"); if (wait_for_state_change (utm, STATE_START)) { clib_warning ("Disconnect failed"); @@ -721,7 +745,7 @@ client_test (uri_tcp_test_main_t * utm) } /* Init test data */ - vec_validate (utm->connect_test_data, 64 * 1024 - 1); + vec_validate (utm->connect_test_data, 128 * 1024 - 1); for (i = 0; i < vec_len (utm->connect_test_data); i++) utm->connect_test_data[i] = i & 0xff; @@ -899,6 +923,9 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); + + session->bytes_received = 0; + session->start = clib_time_now (&utm->clib_time); } void @@ -909,37 +936,50 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, int n_read; session_fifo_event_t evt; unix_shared_memory_queue_t *q; - int rv, bytes; + session_t *session; + int rv; + u32 max_dequeue, offset, max_transfer, rx_buf_len; + rx_buf_len = vec_len (utm->rx_buf); rx_fifo = e->fifo; - tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + session = &utm->sessions[rx_fifo->client_session_index]; + tx_fifo = session->server_tx_fifo; - bytes = svm_fifo_max_dequeue (rx_fifo); + max_dequeue = svm_fifo_max_dequeue (rx_fifo); /* Allow enqueuing of a new event */ svm_fifo_unset_event (rx_fifo); - if (bytes == 0) - return; + if (PREDICT_FALSE (max_dequeue == 0)) + { + return; + } - /* Read the bytes */ + /* Read the max_dequeue */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (utm->rx_buf), - utm->rx_buf); + max_transfer = clib_min (rx_buf_len, max_dequeue); + n_read = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, utm->rx_buf); if (n_read > 0) - bytes -= n_read; - - if (utm->drop_packets) - continue; + { + max_dequeue -= n_read; + session->bytes_received += n_read; + } /* Reflect if a non-drop session */ - if (n_read > 0) + if (!utm->drop_packets && n_read > 0) { + offset = 0; do { - rv = svm_fifo_enqueue_nowait (tx_fifo, n_read, utm->rx_buf); + rv = svm_fifo_enqueue_nowait (tx_fifo, n_read, + &utm->rx_buf[offset]); + if (rv > 0) + { + n_read -= rv; + offset += rv; + } } - while (rv <= 0 && !utm->time_to_stop); + while ((rv <= 0 || n_read > 0) && !utm->time_to_stop); /* If event wasn't set, add one */ if (svm_fifo_set_event (tx_fifo)) @@ -951,11 +991,11 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, q = utm->vpp_event_queue; unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); + 1 /* do wait for mutex */ ); } } } - while ((n_read < 0 || bytes > 0) && !utm->time_to_stop); + while ((n_read < 0 || max_dequeue > 0) && !utm->time_to_stop); } void @@ -1068,9 +1108,18 @@ vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t *session; + + if (mp->retval) + { + clib_warning ("vpp complained about disconnect: %d", + ntohl (mp->retval)); + } - clib_warning ("retval %d", ntohl (mp->retval)); utm->state = STATE_START; + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + if (session) + session_print_stats (utm, session); } #define foreach_uri_msg \ @@ -1123,7 +1172,7 @@ main (int argc, char **argv) /* make the main heap thread-safe */ h->flags |= MHEAP_FLAG_THREAD_SAFE; - vec_validate (utm->rx_buf, 65536); + vec_validate (utm->rx_buf, 128 << 10); utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); @@ -1186,6 +1235,7 @@ main (int argc, char **argv) utm->drop_packets = drop_packets; utm->test_return_packets = test_return_packets; utm->bytes_to_send = bytes_to_send; + utm->time_to_stop = 0; setup_signal_handlers (); uri_api_hookup (utm); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index c679b1f5..4bdb1027 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -117,7 +117,7 @@ application_del (application_t * app) /* Actual listener cleanup */ for (i = 0; i < vec_len (handles); i++) { - a->app_index = app->api_client_index; + a->app_index = app->index; a->handle = handles[i]; /* seg manager is removed when unbind completes */ vnet_unbind (a); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 07eeae82..c0ab1bf0 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -171,7 +171,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Nothing to read return */ if (max_dequeue0 == 0) - return 0; + { + svm_fifo_unset_event (s0->server_tx_fifo); + return 0; + } /* Ensure we're not writing more than transport window allows */ if (max_dequeue0 < snd_space0) @@ -393,7 +396,7 @@ session_event_get_session (session_fifo_event_t * e0, u8 thread_index) s0 = stream_session_get_if_valid (session_index0, thread_index); - ASSERT (s0->thread_index == thread_index); + ASSERT (s0 == 0 || s0->thread_index == thread_index); return s0; } diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index caf8eaa3..bf571963 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -306,11 +306,13 @@ again: if (added_a_segment) { clib_warning ("added a segment, still cant allocate a fifo"); + clib_spinlock_unlock (&sm->lockp); return SESSION_ERROR_NEW_SEG_NO_SPACE; } if (session_manager_add_segment (sm)) { + clib_spinlock_unlock (&sm->lockp); return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; } @@ -320,6 +322,7 @@ again: else { clib_warning ("No space to allocate fifos!"); + clib_spinlock_unlock (&sm->lockp); return SESSION_ERROR_NO_SPACE; } } @@ -361,8 +364,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, if (sm->segment_indices[0] != svm_segment_index && !svm_fifo_segment_has_fifos (fifo_segment)) { + clib_spinlock_lock (&sm->lockp); svm_fifo_segment_delete (fifo_segment); vec_del1 (sm->segment_indices, svm_segment_index); + clib_spinlock_unlock (&sm->lockp); } } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 534598d6..fe198044 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -700,7 +700,7 @@ stream_session_init_fifos_pointers (transport_connection_t * tc, svm_fifo_init_pointers (s->server_tx_fifo, tx_pointer); } -void +int stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail) { @@ -709,6 +709,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, stream_session_t *new_s = 0; u64 handle; u32 api_context = 0; + int error = 0; handle = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, tc->lcl_port, tc->rmt_port, @@ -716,7 +717,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); - return; + return -1; } /* Get the app's index from the handle we stored when opening connection */ @@ -730,9 +731,12 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, /* Create new session (svm segments are allocated if needed) */ if (stream_session_create_i (sm, tc, &new_s)) - return; - - new_s->app_index = app->index; + { + is_fail = 1; + error = -1; + } + else + new_s->app_index = app->index; } /* Notify client */ @@ -741,6 +745,8 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, /* Cleanup session lookup */ stream_session_half_open_table_del (smm, sst, tc); + + return error; } void @@ -981,8 +987,13 @@ session_send_session_evt_to_thread (u64 session_handle, /* Based on request block (or not) for lack of space */ if (PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); + { + if (unix_shared_memory_queue_add (q, (u8 *) & evt, + 1 /* do wait for mutex */ )) + { + clib_warning ("failed to enqueue evt"); + } + } else { clib_warning ("queue full"); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index d9c38bd1..5fa4225c 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -368,8 +368,8 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); -void stream_session_connect_notify (transport_connection_t * tc, u8 sst, - u8 is_fail); +int stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail); void stream_session_init_fifos_pointers (transport_connection_t * tc, u32 rx_pointer, u32 tx_pointer); diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index f772cb9f..60f764af 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -419,7 +419,7 @@ done: REPLY_MACRO (VL_API_UNBIND_URI_REPLY); } -void +static void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) { vl_api_connect_uri_reply_t *rmp; diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index e5f788be..04bd5ca0 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -39,6 +39,7 @@ typedef struct _transport_connection #if TRANSPORT_DEBUG elog_track_t elog_track; /**< Event logging */ + u32 cc_stat_tstamp; /**< CC stats timestamp */ #endif /** Macros for 'derived classes' where base is named "connection" */ @@ -57,6 +58,7 @@ typedef struct _transport_connection #define c_is_ip4 connection.is_ip4 #define c_thread_index connection.thread_index #define c_elog_track connection.elog_track +#define c_cc_stat_tstamp connection.cc_stat_tstamp } transport_connection_t; /* diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 7238cda3..6f8be082 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -43,7 +43,7 @@ #include #undef vl_printfun -#define TCP_BUILTIN_CLIENT_DBG (1) +#define TCP_BUILTIN_CLIENT_DBG (0) static void send_test_chunk (tclient_main_t * tm, session_t * s) @@ -92,7 +92,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s) ed->data[2] = s->bytes_to_send; } - /* Poke the TCP state machine */ + /* Poke the session layer */ if (svm_fifo_set_event (s->server_tx_fifo)) { /* Fabricate TX event, send to vpp */ @@ -100,8 +100,9 @@ send_test_chunk (tclient_main_t * tm, session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("could not enqueue event"); } } } @@ -188,13 +189,13 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, sp = pool_elt_at_index (tm->sessions, connection_indices[i]); - if (tx_quota < 60 && sp->bytes_to_send > 0) + if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; tx_quota++; } - if (sp->bytes_to_receive > 0) + if (!tm->no_return && sp->bytes_to_receive > 0) { prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); @@ -205,13 +206,14 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } if (PREDICT_FALSE (delete_session == 1)) { + __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); + dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); memset (dmp, 0, sizeof (*dmp)); dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; dmp->handle = sp->vpp_session_handle; -// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, 1)) { @@ -247,7 +249,6 @@ VLIB_REGISTER_NODE (builtin_client_node) = }; /* *INDENT-ON* */ - /* So we don't get "no handler for... " msgs */ static void vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) @@ -255,76 +256,10 @@ vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) vlib_main_t *vm = vlib_get_main (); tclient_main_t *tm = &tclient_main; tm->my_client_index = mp->index; - vlib_process_signal_event (vm, tm->node_index, 1 /* evt */ , + vlib_process_signal_event (vm, tm->cli_node_index, 1 /* evt */ , 0 /* data */ ); } -static void -vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) -{ - tclient_main_t *tm = &tclient_main; - session_t *session; - u32 session_index; - i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ; - int i; - - if (retval < 0) - { - clib_warning ("connection failed: retval %d", retval); - return; - } - - tm->our_event_queue = - uword_to_pointer (mp->vpp_event_queue_address, - unix_shared_memory_queue_t *); - tm->vpp_event_queue = - uword_to_pointer (mp->vpp_event_queue_address, - unix_shared_memory_queue_t *); - - /* - * Setup session - */ - pool_get (tm->sessions, session); - memset (session, 0, sizeof (*session)); - session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; - - session->server_rx_fifo = - uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); - session->server_rx_fifo->client_session_index = session_index; - session->server_tx_fifo = - uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); - session->server_tx_fifo->client_session_index = session_index; - session->vpp_session_handle = mp->handle; - - /* Add it to the session lookup table */ - hash_set (tm->session_index_by_vpp_handles, mp->handle, session_index); - - if (tm->ready_connections == tm->expected_connections - 1) - { - vlib_thread_main_t *thread_main = vlib_get_thread_main (); - int thread_index; - - thread_index = 0; - for (i = 0; i < pool_elts (tm->sessions); i++) - { - vec_add1 (tm->connection_index_by_thread[thread_index], i); - thread_index++; - if (thread_index == thread_main->n_vlib_mains) - thread_index = 0; - } - } - __sync_fetch_and_add (&tm->ready_connections, 1); - if (tm->ready_connections == tm->expected_connections) - { - tm->run_test = 1; - tm->test_start_time = vlib_time_now (tm->vlib_main); - /* Signal the CLI process that the action is starting... */ - vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, - 1, 0 /* data */ ); - } -} - static int create_api_loopback (tclient_main_t * tm) { @@ -347,12 +282,11 @@ create_api_loopback (tclient_main_t * tm) mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; mp->context = 0xFEEDFACE; mp->input_queue = pointer_to_uword (tm->vl_input_queue); - strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); + strncpy ((char *) mp->name, "tcp_clients_tester", sizeof (mp->name) - 1); vl_api_memclnt_create_t_handler (mp); /* Wait for reply */ - tm->node_index = vlib_get_current_process (vm)->node_runtime.node_index; vlib_process_wait_for_event_or_clock (vm, 1.0); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) @@ -373,7 +307,6 @@ create_api_loopback (tclient_main_t * tm) #define foreach_tclient_static_api_msg \ _(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \ -_(CONNECT_URI_REPLY, connect_uri_reply) static clib_error_t * tclient_api_hookup (vlib_main_t * vm) @@ -411,8 +344,8 @@ tcp_test_clients_init (vlib_main_t * vm) if (create_api_loopback (tm)) return -1; - /* Init test data */ - vec_validate (tm->connect_test_data, 64 * 1024 - 1); + /* Init test data. Big buffer */ + vec_validate (tm->connect_test_data, 1024 * 1024 - 1); for (i = 0; i < vec_len (tm->connect_test_data); i++) tm->connect_test_data[i] = i & 0xff; @@ -430,37 +363,66 @@ static int builtin_session_connected_callback (u32 app_index, u32 api_context, stream_session_t * s, u8 is_fail) { - vl_api_connect_uri_reply_t _m, *mp = &_m; - unix_shared_memory_queue_t *q; - application_t *app; - unix_shared_memory_queue_t *vpp_queue; + tclient_main_t *tm = &tclient_main; + session_t *session; + u32 session_index; + int i; - app = application_get (app_index); - q = vl_api_client_index_to_input_queue (app->api_client_index); + if (is_fail) + { + clib_warning ("connection %d failed!", api_context); + vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, -1, + 0 /* data */ ); + return -1; + } - if (!q) - return -1; + /* Mark vpp session as connected */ + s->session_state = SESSION_STATE_READY; - memset (mp, 0, sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); - mp->context = api_context; - if (!is_fail) + tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index); + tm->vpp_event_queue = session_manager_get_vpp_event_queue (s->thread_index); + + /* + * Setup session + */ + pool_get (tm->sessions, session); + memset (session, 0, sizeof (*session)); + session_index = session - tm->sessions; + session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->server_rx_fifo = s->server_rx_fifo; + session->server_rx_fifo->client_session_index = session_index; + session->server_tx_fifo = s->server_tx_fifo; + session->server_tx_fifo->client_session_index = session_index; + session->vpp_session_handle = stream_session_handle (s); + + /* Add it to the session lookup table */ + hash_set (tm->session_index_by_vpp_handles, session->vpp_session_handle, + session_index); + + if (tm->ready_connections == tm->expected_connections - 1) { - vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); - mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo); - mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo); - mp->handle = stream_session_handle (s); - mp->vpp_event_queue_address = pointer_to_uword (vpp_queue); - mp->retval = 0; - s->session_state = SESSION_STATE_READY; + vlib_thread_main_t *thread_main = vlib_get_thread_main (); + int thread_index; + + thread_index = 0; + for (i = 0; i < pool_elts (tm->sessions); i++) + { + vec_add1 (tm->connection_index_by_thread[thread_index], i); + thread_index++; + if (thread_index == thread_main->n_vlib_mains) + thread_index = 0; + } } - else + __sync_fetch_and_add (&tm->ready_connections, 1); + if (tm->ready_connections == tm->expected_connections) { - mp->retval = clib_host_to_net_u32 (VNET_API_ERROR_SESSION_CONNECT_FAIL); + tm->run_test = 1; + tm->test_start_time = vlib_time_now (tm->vlib_main); + /* Signal the CLI process that the action is starting... */ + vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, 1, + 0 /* data */ ); } - vl_api_connect_uri_reply_t_handler (mp); - return 0; } @@ -489,23 +451,22 @@ builtin_server_rx_callback (stream_session_t * s) } /* *INDENT-OFF* */ -static session_cb_vft_t builtin_clients = - { - .session_reset_callback = builtin_session_reset_callback, - .session_connected_callback = builtin_session_connected_callback, - .session_accept_callback = builtin_session_create_callback, - .session_disconnect_callback = builtin_session_disconnect_callback, - .builtin_server_rx_callback = builtin_server_rx_callback - }; +static session_cb_vft_t builtin_clients = { + .session_reset_callback = builtin_session_reset_callback, + .session_connected_callback = builtin_session_connected_callback, + .session_accept_callback = builtin_session_create_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; /* *INDENT-ON* */ static int -attach_builtin_test_clients () +attach_builtin_test_clients_app (void) { tclient_main_t *tm = &tclient_main; vnet_app_attach_args_t _a, *a = &_a; u8 segment_name[128]; - u32 segment_name_length; + u32 segment_name_length, prealloc_fifos; u64 options[16]; segment_name_length = ARRAY_LEN (segment_name); @@ -518,13 +479,68 @@ attach_builtin_test_clients () a->segment_name_length = segment_name_length; a->session_cb_vft = &builtin_clients; + prealloc_fifos = tm->prealloc_fifos ? tm->expected_connections : 1; + options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; - options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); + options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; + options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; + options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; a->options = options; - return vnet_application_attach (a); + if (vnet_application_attach (a)) + return -1; + + tm->app_index = a->app_index; + return 0; +} + +static void * +tclient_thread_fn (void *arg) +{ + return 0; +} + +/** Start a transmit thread */ +int +start_tx_pthread (tclient_main_t * tm) +{ + if (tm->client_thread_handle == 0) + { + int rv = pthread_create (&tm->client_thread_handle, + NULL /*attr */ , + tclient_thread_fn, 0); + if (rv) + { + tm->client_thread_handle = 0; + return -1; + } + } + return 0; +} + +void +clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) +{ + tclient_main_t *tm = &tclient_main; + vnet_connect_args_t _a, *a = &_a; + int i; + for (i = 0; i < n_clients; i++) + { + memset (a, 0, sizeof (*a)); + + a->uri = (char *) uri; + a->api_context = i; + a->app_index = tm->app_index; + a->mp = 0; + vnet_connect_uri (a); + + /* Crude pacing for call setups, 100k/sec */ + vlib_process_suspend (vm, 10e-6); + } } static clib_error_t * @@ -534,17 +550,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, { tclient_main_t *tm = &tclient_main; vlib_thread_main_t *thread_main = vlib_get_thread_main (); - uword *event_data = 0; - uword event_type; - u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234"; - u8 *uri; + uword *event_data = 0, event_type; + u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri; + u64 tmp, total_bytes; + f64 cli_timeout = 20.0, delta; u32 n_clients = 1; + char *transfer_type; int i; - u64 tmp; - f64 cli_timeout = 20.0; - f64 delta; tm->bytes_to_send = 8192; + tm->no_return = 0; + tm->fifo_size = 64 << 10; + vec_free (tm->connect_uri); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) @@ -561,11 +578,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "cli-timeout %f", &cli_timeout)) ; + else if (unformat (input, "no-return")) + tm->no_return = 1; + else if (unformat (input, "fifo-size %d", &tm->fifo_size)) + tm->fifo_size <<= 10; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } + /* Store cli process node index for signalling */ + tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index; + if (tm->is_init == 0) { if (tcp_test_clients_init (vm)) @@ -575,28 +599,25 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->ready_connections = 0; tm->expected_connections = n_clients; tm->rx_total = 0; + tm->tx_total = 0; - uri = connect_uri; + uri = default_connect_uri; if (tm->connect_uri) uri = tm->connect_uri; #if TCP_BUILTIN_CLIENT_PTHREAD - /* Start a transmit thread */ - if (tm->client_thread_handle == 0) + start_tx_pthread (); +#endif + + vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); + + if (tm->test_client_attached == 0) { - int rv = pthread_create (&tm->client_thread_handle, - NULL /*attr */ , - tclient_thread_fn, 0); - if (rv) + if (attach_builtin_test_clients_app ()) { - tm->client_thread_handle = 0; - return clib_error_return (0, "pthread_create returned %d", rv); + return clib_error_return (0, "app attach failed"); } } -#endif - vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); - if (tm->test_client_attached == 0) - attach_builtin_test_clients (); tm->test_client_attached = 1; /* Turn on the builtin client input nodes */ @@ -604,25 +625,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_node_set_state (vlib_mains[i], builtin_client_node.index, VLIB_NODE_STATE_POLLING); - tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index; - /* Fire off connect requests */ - for (i = 0; i < n_clients; i++) - { - vl_api_connect_uri_t _cmp, *cmp = &_cmp; - void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * cmp); - - memset (cmp, 0, sizeof (*cmp)); - - cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = tm->my_client_index; - cmp->context = ntohl (0xfeedface); - memcpy (cmp->uri, uri, strlen ((char *) uri) + 1); - - vl_api_connect_uri_t_handler (cmp); - /* Crude pacing for call setups, 100k/sec */ - vlib_process_suspend (vm, 10e-6); - } + clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... */ vlib_process_wait_for_event_or_clock (vm, 10.0 /* timeout, seconds */ ); @@ -668,14 +672,17 @@ test_tcp_clients_command_fn (vlib_main_t * vm, if (delta != 0.0) { + total_bytes = (tm->no_return ? tm->tx_total : tm->rx_total); + transfer_type = tm->no_return ? "half-duplex" : "full-duplex"; vlib_cli_output (vm, "%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds", - tm->rx_total, tm->rx_total / (1ULL << 20), - tm->rx_total / (1ULL << 30), delta); - vlib_cli_output (vm, "%.2f bytes/second full-duplex", - ((f64) tm->rx_total) / (delta)); - vlib_cli_output (vm, "%.4f gbit/second full-duplex", - (((f64) tm->rx_total * 8.0) / delta / 1e9)); + total_bytes, total_bytes / (1ULL << 20), + total_bytes / (1ULL << 30), delta); + vlib_cli_output (vm, "%.2f bytes/second %s", + ((f64) total_bytes) / (delta), transfer_type); + vlib_cli_output (vm, "%.4f gbit/second %s", + (((f64) total_bytes * 8.0) / delta / 1e9), + transfer_type); } else vlib_cli_output (vm, "zero delta-t?"); diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index d5d79e53..3462e0ee 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -44,78 +44,59 @@ typedef struct typedef struct { - /* API message ID base */ - u16 msg_id_base; - - /* vpe input queue */ - unix_shared_memory_queue_t *vl_input_queue; - - /* API client handle */ - u32 my_client_index; - - /* The URI we're playing with */ - u8 *uri; - - /* Session pool */ - session_t *sessions; - - /* Hash table for disconnect processing */ - uword *session_index_by_vpp_handles; - - /* intermediate rx buffer */ - u8 *rx_buf; - - /* URI for slave's connect */ - u8 *connect_uri; - - u32 connected_session_index; - - int i_am_master; - - /* drop all packets */ - int drop_packets; - - /* Our event queue */ - unix_shared_memory_queue_t *our_event_queue; - - /* $$$ single thread only for the moment */ - unix_shared_memory_queue_t *vpp_event_queue; - - pid_t my_pid; - - f64 test_start_time; - f64 test_end_time; - - u32 expected_connections; + /* + * Application setup parameters + */ + unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */ + unix_shared_memory_queue_t *our_event_queue; /**< Our event queue */ + unix_shared_memory_queue_t *vpp_event_queue; /**< $$$ single thread */ + + u32 cli_node_index; /**< cli process node index */ + u32 my_client_index; /**< loopback API client handle */ + u32 app_index; /**< app index after attach */ + + /* + * Configuration params + */ + u8 *connect_uri; /**< URI for slave's connect */ + u64 bytes_to_send; /**< Bytes to send */ + u32 configured_segment_size; + u32 fifo_size; + u32 expected_connections; /**< Number of clients/connections */ + + /* + * Test state variables + */ + session_t *sessions; /**< Sessions pool */ + u8 *rx_buf; /**< intermediate rx buffer */ + uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ + u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + pthread_t client_thread_handle; + volatile u32 ready_connections; volatile u32 finished_connections; - volatile u64 rx_total; - u32 cli_node_index; - - /* Signal variable */ - volatile int run_test; - - /* Bytes to send */ - u64 bytes_to_send; - - u32 configured_segment_size; + volatile u64 tx_total; + volatile int run_test; /**< Signal start of test */ - /* VNET_API_ERROR_FOO -> "Foo" hash table */ - uword *error_string_by_error_number; - - u8 *connect_test_data; - pthread_t client_thread_handle; - u64 client_bytes_received; - u8 test_return_packets; + f64 test_start_time; + f64 test_end_time; + /* + * Flags + */ u8 is_init; u8 test_client_attached; + u8 no_return; + u8 test_return_packets; + int i_am_master; + int drop_packets; /**< drop all packets */ + u8 prealloc_fifos; /**< Request fifo preallocation */ - u32 node_index; - - /* convenience */ + /* + * Convenience + */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; ethernet_main_t *ethernet_main; diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 8bd2f360..775bfc26 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -39,21 +39,30 @@ typedef struct { - /* Per-thread RX buffer */ - u8 **rx_buf; + /* + * Server app parameters + */ unix_shared_memory_queue_t **vpp_queue; - u64 byte_index; + unix_shared_memory_queue_t *vl_input_queue; /**< Sever's event queue */ - /* Sever's event queue */ - unix_shared_memory_queue_t *vl_input_queue; + u32 app_index; /**< Server app index */ + u32 my_client_index; /**< API client handle */ + u32 node_index; /**< process node index for evnt scheduling */ - /* API client handle */ - u32 my_client_index; + /* + * Config params + */ + u8 no_echo; /**< Don't echo traffic */ + u32 fifo_size; /**< Fifo size */ + u32 rcv_buffer_size; /**< Rcv buffer size */ + u32 prealloc_fifos; /**< Preallocate fifos */ - u32 app_index; + /* + * Test state + */ + u8 **rx_buf; /**< Per-thread RX buffer */ + u64 byte_index; - /* process node index for evnt scheduling */ - u32 node_index; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -132,6 +141,29 @@ test_bytes (builtin_server_main_t * bsm, int actual_transfer) bsm->byte_index += actual_transfer; } +/* + * If no-echo, just read the data and be done with it + */ +int +builtin_server_rx_callback_no_echo (stream_session_t * s) +{ + builtin_server_main_t *bsm = &builtin_server_main; + u32 my_thread_id = vlib_get_thread_index (); + int actual_transfer; + svm_fifo_t *rx_fifo; + + rx_fifo = s->server_rx_fifo; + + do + { + actual_transfer = + svm_fifo_dequeue_nowait (rx_fifo, bsm->rcv_buffer_size, + bsm->rx_buf[my_thread_id]); + } + while (actual_transfer > 0); + return 0; +} + int builtin_server_rx_callback (stream_session_t * s) { @@ -143,8 +175,8 @@ builtin_server_rx_callback (stream_session_t * s) static int serial_number = 0; u32 my_thread_id = vlib_get_thread_index (); - tx_fifo = s->server_tx_fifo; rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -164,19 +196,22 @@ builtin_server_rx_callback (stream_session_t * s) /* Program self-tap to retry */ if (svm_fifo_set_event (rx_fifo)) { + unix_shared_memory_queue_t *q; evt.fifo = rx_fifo; evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, - 0 /* do wait for mutex */ ); + + q = bsm->vpp_queue[s->thread_index]; + if (PREDICT_FALSE (q->cursize == q->maxsize)) + clib_warning ("out of event queue space"); + else + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* don't wait for mutex */ ); } return 0; } - vec_validate (bsm->rx_buf, my_thread_id); - vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1); _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, @@ -281,14 +316,21 @@ server_attach () memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); + if (bsm->no_echo) + builtin_session_cb_vft.builtin_server_rx_callback = + builtin_server_rx_callback_no_echo; + else + builtin_session_cb_vft.builtin_server_rx_callback = + builtin_server_rx_callback; a->api_client_index = bsm->my_client_index; a->session_cb_vft = &builtin_session_cb_vft; a->options = options; a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; - a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; - a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; - a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 8192; + a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = + bsm->prealloc_fifos ? bsm->prealloc_fifos : 1; a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -316,17 +358,24 @@ static int server_create (vlib_main_t * vm) { builtin_server_main_t *bsm = &builtin_server_main; - u32 num_threads; vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; + int i; if (bsm->my_client_index == (u32) ~ 0) { if (create_api_loopback (vm)) - return -1; + { + clib_warning ("failed to create api loopback"); + return -1; + } } num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); + vec_validate (bsm->rx_buf, num_threads - 1); + for (i = 0; i < num_threads; i++) + vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); if (server_attach ()) { @@ -381,23 +430,35 @@ tcp_builtin_server_api_hookup (vlib_main_t * vm) } static clib_error_t * -server_create_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) +server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) { + builtin_server_main_t *bsm = &builtin_server_main; int rv; -#if 0 + + bsm->no_echo = 0; + bsm->fifo_size = 64 << 10; + bsm->rcv_buffer_size = 128 << 10; + bsm->prealloc_fifos = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - if (unformat (input, "whatever %d", &whatever)) + if (unformat (input, "no-echo")) + bsm->no_echo = 1; + else if (unformat (input, "fifo-size %d", &bsm->fifo_size)) + bsm->fifo_size <<= 10; + else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) + ; + else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } -#endif tcp_builtin_server_api_hookup (vm); vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); + rv = server_create (vm); switch (rv) { @@ -406,6 +467,7 @@ server_create_command_fn (vlib_main_t * vm, default: return clib_error_return (0, "server_create returned %d", rv); } + return 0; } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index e0b67a8e..5c554bac 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -726,15 +726,25 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) u32 tcp_snd_space (tcp_connection_t * tc) { - int snd_space; + int snd_space, snt_limited; - /* If we haven't gotten dupacks or if we did and have gotten sacked bytes - * then we can still send */ - if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0 - && (tc->rcv_dupacks == 0 - || tc->sack_sb.last_sacked_bytes))) + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) { snd_space = tcp_available_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked + * bytes then we can still send as per Limited Transmit (RFC3042) */ + if (PREDICT_FALSE (tc->rcv_dupacks != 0 + && (tcp_opts_sack_permitted (tc) + && tc->sack_sb.last_sacked_bytes == 0))) + { + if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt) + tc->limited_transmit = tc->snd_nxt; + ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); + + snt_limited = tc->snd_nxt - tc->limited_transmit; + snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0); + } return tcp_round_snd_space (tc, snd_space); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 071f1ab1..e8398718 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -31,9 +31,9 @@ #define TCP_MAX_OPTION_SPACE 40 #define TCP_DUPACK_THRESHOLD 3 -#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_MAX_RX_FIFO_SIZE 4 << 20 #define TCP_IW_N_SEGMENTS 10 -#define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ +#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ #define TCP_USE_SACKS 1 /**< Disable only for testing */ /** TCP FSM state definitions as per RFC793. */ @@ -100,6 +100,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ +#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */ #define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ #define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ #define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ @@ -149,7 +150,7 @@ enum #undef _ }; -#define TCP_MAX_SACK_BLOCKS 5 /**< Max number of SACK blocks stored */ +#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */ #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) typedef struct _sack_scoreboard_hole @@ -208,6 +209,7 @@ typedef struct _tcp_connection u32 snd_wl1; /**< seq number used for last snd.wnd update */ u32 snd_wl2; /**< ack number used for last snd.wnd update */ u32 snd_nxt; /**< next seq number to be sent */ + u16 snd_mss; /**< Effective send max seg (data) size */ /** Receive sequence variables RFC793 */ u32 rcv_nxt; /**< next sequence number expected */ @@ -252,8 +254,8 @@ typedef struct _tcp_connection u32 rtt_ts; /**< Timestamp for tracked ACK */ u32 rtt_seq; /**< Sequence number for tracked ACK */ - u16 snd_mss; /**< Effective send max seg (data) size */ u16 mss; /**< Our max seg size that includes options */ + u32 limited_transmit; /**< snd_nxt when limited transmit starts */ } tcp_connection_t; struct _tcp_cc_algorithm @@ -433,6 +435,7 @@ tcp_end_seq (tcp_header_t * th, u32 len) #define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) #define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) #define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) +#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) /* Modulo arithmetic for timestamps */ #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) @@ -719,6 +722,7 @@ scoreboard_clear (sack_scoreboard_t * sb) { scoreboard_remove_hole (sb, hole); } + ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; sb->last_bytes_delivered = 0; diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 3a16cf63..ae68ad1b 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,8 +19,10 @@ #include #define TCP_DEBUG (1) +#define TCP_DEBUG_SM (0) #define TCP_DEBUG_CC (1) -#define TCP_DEBUG_VERBOSE (0) +#define TCP_DEBUG_CC_STAT (1) +#define TCP_DEBUG_SM_VERBOSE (0) #define foreach_tcp_dbg_evt \ _(INIT, "") \ @@ -49,6 +51,8 @@ _(CC_RTX, "retransmit") \ _(CC_EVT, "cc event") \ _(CC_PACK, "cc partial ack") \ + _(CC_STAT, "cc stats") \ + _(CC_RTO_STAT, "cc rto stats") \ _(SEG_INVALID, "invalid segment") \ _(PAWS_FAIL, "failed paws check") \ _(ACK_RCV_ERR, "invalid ack") \ @@ -72,6 +76,10 @@ typedef enum _tcp_dbg_evt #define TRANSPORT_DEBUG (1) +/* + * Infra and evt track setup + */ + #define TCP_DBG(_tc, _evt, _args...) \ { \ u8 *_tmp = 0; \ @@ -158,6 +166,30 @@ typedef enum _tcp_dbg_evt TCP_EVT_DEALLOC_HANDLER(_tc); \ } +#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYNrx: irs %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ +} + +#define CONCAT_HELPER(_a, _b) _a##_b +#define CC(_a, _b) CONCAT_HELPER(_a, _b) +#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) +#else +#define TCP_EVT_DBG(_evt, _args...) +#endif + +/* + * State machine + */ +#if TCP_DEBUG_SM + #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -234,18 +266,6 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ -{ \ - TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "SYNrx: irs %u", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->irs; \ -} - #define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -418,6 +438,74 @@ typedef enum _tcp_dbg_evt ed->data[4] = _tc->snd_una_max - _tc->iss; \ } +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} +#else +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) +#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) +#endif + +/* + * State machine verbose + */ +#if TCP_DBG_SM_VERBOSE +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "snd_wnd update: %u ", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->snd_wnd; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} +#else +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#endif + /* * Congestion Control */ @@ -471,67 +559,59 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->snd_una_max - _tc->iss; \ } -#else -#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) -#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, _snd_space, ...) -#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) -#endif +/* + * Congestion control stats + */ +#if TCP_DEBUG_CC_STAT -#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +#define STATS_INTERVAL 1 + +#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \ { \ -if (_av > 0) \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ - .format_args = "i4i4i4i4i4", \ + .format = "rto_stat: rto %u srtt %u rttvar %u ", \ + .format_args = "i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_wnd; \ - ed->data[1] = _obs; \ - ed->data[2] = _av; \ - ed->data[3] = _tc->rcv_nxt - _tc->irs; \ - ed->data[4] = _tc->rcv_las - _tc->irs; \ + DECLARE_ETD(_tc, _e, 3); \ + ed->data[0] = _tc->rto; \ + ed->data[1] = _tc->srtt; \ + ed->data[2] = _tc->rttvar; \ } \ } -#if TCP_DBG_VERBOSE -#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \ { \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "snd_wnd update: %u ", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->snd_wnd; \ -} - -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "out: flags %x, bytes %u", \ - .format_args = "i4i4", \ + .format = "cc_stat: cwnd %u flight %u space %u ssthresh %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = flags; \ - ed->data[1] = n_bytes; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->cwnd; \ + ed->data[1] = tcp_flight_size (_tc); \ + ed->data[2] = tcp_snd_space (_tc); \ + ed->data[3] = _tc->ssthresh; \ + ed->data[4] = _tc->snd_wnd; \ + TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \ + _tc->c_cc_stat_tstamp = tcp_time_now(); \ +} \ } + #else -#define TCP_EVT_SND_WND_HANDLER(_tc, ...) -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) #endif -#define CONCAT_HELPER(_a, _b) _a##_b -#define CC(_a, _b) CONCAT_HELPER(_a, _b) -#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) - #else -#define TCP_EVT_DBG(_evt, _args...) +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) #endif - #endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ff2229b3..a2e6dad1 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -378,16 +378,20 @@ tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0) static void tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) { - int err; + int err, diff; if (tc->srtt != 0) { err = mrtt - tc->srtt; - tc->srtt += err >> 3; +// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ - tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; +// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; + + tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); + diff = (clib_abs (err) - (int) tc->rttvar) >> 2; + tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); } else { @@ -401,6 +405,7 @@ void tcp_update_rto (tcp_connection_t * tc) { tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tc->rto = clib_max (tc->rto, TCP_RTO_MIN); } /** Update RTT estimate and RTO timer @@ -417,8 +422,8 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) u32 mrtt = 0; u8 rtx_acked; - /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */ - rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); + /* Determine if only rtx bytes are acked. */ + rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked; /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ @@ -428,8 +433,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: - * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't - * try to update rtt for dupacks */ + * seq_lt (tc->snd_una, ack). */ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr && tc->bytes_acked) { @@ -550,11 +554,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, prev = scoreboard_get_hole (sb, prev_index); if (prev) { - hole->prev = prev - sb->holes; + hole->prev = prev_index; hole->next = prev->next; if ((next = scoreboard_next_hole (sb, hole))) next->prev = hole_index; + else + sb->tail = hole_index; prev->next = hole_index; } @@ -569,12 +575,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, } void -scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) +scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) { sack_scoreboard_hole_t *hole, *prev; u32 bytes = 0, blks = 0; sb->lost_bytes = 0; + sb->sacked_bytes = 0; hole = scoreboard_last_hole (sb); if (!hole) return; @@ -594,13 +601,16 @@ scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) hole = prev; } - hole = prev; while (hole) { sb->lost_bytes += scoreboard_hole_bytes (hole); hole->is_lost = 1; + prev = hole; hole = scoreboard_prev_hole (sb, hole); + if (hole) + bytes += prev->start - hole->end; } + sb->sacked_bytes = bytes; } /** @@ -677,7 +687,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; - sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; + sack_scoreboard_hole_t *hole, *next_hole, *last_hole; u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; @@ -743,6 +753,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) if (seq_gt (tc->snd_una_max, sb->high_sacked) && seq_gt (tc->snd_una_max, last_hole->end)) last_hole->end = tc->snd_una_max; + /* keep track of max byte sacked for when the last hole + * is acked */ + if (seq_gt (tmp.end, sb->high_sacked)) + sb->high_sacked = tmp.end; } /* Walk the holes with the SACK blocks */ @@ -758,45 +772,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { next_hole = scoreboard_next_hole (sb, hole); - /* Byte accounting */ - if (seq_leq (hole->end, ack)) - { - /* Bytes lost because snd_wnd left edge advances */ - if (next_hole && seq_leq (next_hole->start, ack)) - sb->last_bytes_delivered += next_hole->start - hole->end; - else - sb->last_bytes_delivered += ack - hole->end; - } - else - { - sb->sacked_bytes += scoreboard_hole_bytes (hole); - } - - /* About to remove last hole */ - if (hole == last_hole) - { - sb->tail = hole->prev; - last_hole = scoreboard_last_hole (sb); - /* keep track of max byte sacked for when the last hole - * is acked */ - if (seq_gt (hole->end, sb->high_sacked)) - sb->high_sacked = hole->end; - } - - /* snd_una needs to be advanced */ - if (blk->end == ack && seq_geq (ack, hole->end)) + /* Byte accounting: snd_una needs to be advanced */ + if (blk->end == ack) { - if (next_hole && seq_lt (ack, next_hole->start)) + if (next_hole) { - sb->snd_una_adv = next_hole->start - ack; - - /* all these can be delivered */ - sb->last_bytes_delivered += sb->snd_una_adv; + if (seq_lt (ack, next_hole->start)) + sb->snd_una_adv = next_hole->start - ack; + sb->last_bytes_delivered += + next_hole->start - hole->end; } else if (!next_hole) { sb->snd_una_adv = sb->high_sacked - ack; - sb->last_bytes_delivered += sb->snd_una_adv; + sb->last_bytes_delivered += sb->high_sacked - hole->end; } } @@ -808,7 +797,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { if (seq_gt (blk->end, hole->start)) { - sb->sacked_bytes += blk->end - hole->start; hole->start = blk->end; } blk_index++; @@ -819,28 +807,16 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Hole must be split */ if (seq_lt (blk->end, hole->end)) { - sb->sacked_bytes += blk->end - blk->start; hole_index = scoreboard_hole_index (sb, hole); - new_hole = scoreboard_insert_hole (sb, hole_index, blk->end, - hole->end); + scoreboard_insert_hole (sb, hole_index, blk->end, hole->end); /* Pool might've moved */ hole = scoreboard_get_hole (sb, hole_index); hole->end = blk->start; - - /* New or split of tail */ - if ((last_hole->end == new_hole->end) - || seq_lt (last_hole->end, new_hole->start)) - { - last_hole = new_hole; - sb->tail = scoreboard_hole_index (sb, new_hole); - } - blk_index++; } - else if (seq_leq (blk->start, hole->end)) + else if (seq_lt (blk->start, hole->end)) { - sb->sacked_bytes += hole->end - blk->start; hole->end = blk->start; } @@ -848,9 +824,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } } - sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; - sb->sacked_bytes -= sb->last_bytes_delivered; - scoreboard_update_lost (tc, sb); + scoreboard_update_bytes (tc, sb); + sb->last_sacked_bytes = sb->sacked_bytes + - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->sacked_bytes == 0 + || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max + - seq_max (tc->snd_una, ack)); } /** @@ -998,9 +978,14 @@ tcp_should_fastrecover (tcp_connection_t * tc) || tcp_should_fastrecover_sack (tc)); } +/** + * One function to rule them all ... and in the darkness bind them + */ static void tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { + u32 rxt_delivered; + /* * Duplicate ACK. Check if we should enter fast recovery, or if already in * it account for the bytes that left the network. @@ -1028,10 +1013,15 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold + * 1) Cumulative ack does not cover more than congestion threshold, + * and the following doesn't hold: the congestion window is + * greater than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes (XXX) * 2) RFC6582 heuristic to avoid multiple fast retransmits */ - if (seq_leq (tc->snd_una, tc->snd_congestion) + if ((seq_gt (tc->snd_una, tc->snd_congestion) + || !(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) || tc->rcv_opts.tsecr != tc->tsecr_last_ack) { tc->rcv_dupacks = 0; @@ -1089,7 +1079,10 @@ partial_ack: { /* If spurious return, we've already updated everything */ if (tcp_cc_recover (tc)) - return; + { + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + return; + } tc->snd_nxt = tc->snd_una_max; @@ -1115,12 +1108,16 @@ partial_ack: return; /* Remove retransmitted bytes that have been delivered */ - if (tc->sack_sb.last_bytes_delivered - && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv + >= tc->sack_sb.last_bytes_delivered); + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; + if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ - tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered; + ASSERT (tc->snd_rxt_bytes >= rxt_delivered); + tc->snd_rxt_bytes -= rxt_delivered; } else { @@ -1154,6 +1151,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, prev_snd_una; u8 is_dack; + TCP_EVT_DBG (TCP_EVT_CC_STAT, tc); + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { @@ -1282,6 +1281,10 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { vec_add1 (new_list, tc->snd_sacks[i]); } + else + { + clib_warning ("sack discarded"); + } } ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); @@ -1358,16 +1361,18 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, stream_session_t *s0; int rv; + ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Do nothing */ if (PREDICT_FALSE (data_len == 0)) { return TCP_ERROR_PURE_ACK; } - /* Enqueue out-of-order data with absolute offset */ + /* Enqueue out-of-order data with relative offset */ rv = stream_session_enqueue_data (&tc->connection, b, - vnet_buffer (b)->tcp.seq_number, - 0 /* queue event */ , 0); + vnet_buffer (b)->tcp.seq_number - + tc->rcv_nxt, 0 /* queue event */ , 0); /* Nothing written */ if (rv) @@ -1388,10 +1393,15 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, /* Get the newest segment from the fifo */ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); - start = ooo_segment_offset (s0->server_rx_fifo, newest); - end = ooo_segment_end_offset (s0->server_rx_fifo, newest); + if (newest) + { + start = + tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + end = start + ooo_segment_length (s0->server_rx_fifo, newest); + tcp_update_sack_list (tc, start, end); - tcp_update_sack_list (tc, start, end); + ASSERT (seq_gt (start, tc->rcv_nxt)); + } } return TCP_ERROR_ENQUEUED; @@ -1411,7 +1421,7 @@ tcp_can_delack (tcp_connection_t * tc) /* constrained to send ack */ || (tc->flags & TCP_CONN_SNDACK) != 0 /* we're almost out of tx wnd */ - || tcp_available_snd_space (tc) < 2 * tc->snd_mss) + || tcp_available_snd_space (tc) < 4 * tc->snd_mss) return 0; return 1; @@ -1434,7 +1444,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, *next0 = TCP_NEXT_DROP; /* Completely in the past (possible retransmit) */ - if (seq_lt (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) + if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) goto done; /* Chop off the bytes in the past */ @@ -1873,8 +1883,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->rcv_opts)) new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; - /* No scaling */ - new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) + << new_tc0->snd_wscale; new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; @@ -1892,8 +1902,15 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Make sure las is initialized for the wnd computation */ new_tc0->rcv_las = new_tc0->rcv_nxt; - /* Notify app that we have connection */ - stream_session_connect_notify (&new_tc0->connection, sst, 0); + /* Notify app that we have connection. If session layer can't + * allocate session send reset */ + if (stream_session_connect_notify (&new_tc0->connection, sst, + 0)) + { + tcp_connection_cleanup (new_tc0); + tcp_send_reset (b0, is_ip4); + goto drop; + } stream_session_init_fifos_pointers (&new_tc0->connection, new_tc0->irs + 1, @@ -1907,7 +1924,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->state = TCP_STATE_SYN_RCVD; /* Notify app that we have connection */ - stream_session_connect_notify (&new_tc0->connection, sst, 0); + if (stream_session_connect_notify + (&new_tc0->connection, sst, 0)) + { + tcp_connection_cleanup (new_tc0); + tcp_send_reset (b0, is_ip4); + goto drop; + } + stream_session_init_fifos_pointers (&new_tc0->connection, new_tc0->irs + 1, new_tc0->iss + 1); @@ -2508,8 +2532,8 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&child0->rcv_opts)) child0->snd_wscale = child0->rcv_opts.wscale; - /* No scaling */ - child0->snd_wnd = clib_net_to_host_u16 (th0->window); + child0->snd_wnd = clib_net_to_host_u16 (th0->window) + << child0->snd_wscale; child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -2892,6 +2916,9 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c66250e4..c825e952 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -18,7 +18,6 @@ void newreno_congestion (tcp_connection_t * tc) { - tc->prev_ssthresh = tc->ssthresh; tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); } @@ -47,7 +46,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) { if (ack_type == TCP_CC_DUPACK) { - tc->cwnd += tc->snd_mss; + if (!tcp_opts_sack_permitted (tc)) + tc->cwnd += tc->snd_mss; } else if (ack_type == TCP_CC_PARTIALACK) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 47c94e6d..554a981d 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1052,6 +1052,7 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } @@ -1213,7 +1214,7 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ - if (tc->state == TCP_STATE_CLOSED + if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) return; @@ -1505,10 +1506,7 @@ tcp46_output_inline (vlib_main_t * vm, /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); - if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) - { - tcp_timer_reset (tc0, TCP_TIMER_DELACK); - } + tcp_timer_reset (tc0, TCP_TIMER_DELACK); /* If not retransmitting * 1) update snd_una_max (SYN, SYNACK, FIN) @@ -1630,7 +1628,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); - if (tc->rtt_ts == 0) + if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) { tc->rtt_ts = tcp_time_now (); tc->rtt_seq = tc->snd_nxt; diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 3f8afa40..a461e3b8 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -190,11 +190,18 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((pool_elts (sb->holes) == 1), "scoreboard has %d elements", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX + && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid"); + TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d", + sb->last_bytes_delivered); /* * Add some more blocks and then remove all */ vec_reset_length (tc->rcv_opts.sacks); + tc->snd_una += sb->snd_una_adv; + tc->snd_una_max = 1900; for (i = 0; i < 5; i++) { block.start = i * 100 + 1200; @@ -242,6 +249,39 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); + /* + * Inject one block, ack it and overlap hole + */ + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + + block.start = 100; + block.end = 500; + vec_add1 (tc->rcv_opts.sacks, block); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + + tcp_rcv_sacks (tc, 0); + + if (verbose) + vlib_cli_output (vm, "sb added [100, 500]:\n%U", + format_tcp_scoreboard, sb); + + tcp_rcv_sacks (tc, 800); + + if (verbose) + vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb); + + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->last_bytes_delivered == 400), + "last bytes delivered %d", sb->last_bytes_delivered); + return 0; } @@ -571,7 +611,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) */ for (i = 0; i < 3; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); if (i == 0) { @@ -600,7 +640,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) /* * Try adding a completely overlapped segment */ - offset = 3 * sizeof (u32); + offset = 3 * sizeof (u32) - f->tail; data = (u8 *) (test_data + 3); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (rv) @@ -626,7 +666,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) */ for (i = 3; i > 1; i--) { - offset = (2 * i + 0) * sizeof (u32); + offset = (2 * i + 0) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 0)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -688,7 +728,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 4; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -701,7 +741,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } - rv = svm_fifo_enqueue_with_offset (f, 8, 21, data); + rv = svm_fifo_enqueue_with_offset (f, 8 - f->tail, 21, data); TCP_TEST ((rv == 0), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); @@ -722,7 +762,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 4; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -735,7 +775,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } + if (verbose) + vlib_cli_output (vm, "fifo after enqueue: %U", format_svm_fifo, f, 1); + rv = svm_fifo_enqueue_nowait (f, 29, data); + if (verbose) + vlib_cli_output (vm, "fifo after enqueueing 29: %U", format_svm_fifo, f, + 1); TCP_TEST ((rv == 32), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); @@ -788,7 +834,8 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = vp + i; data64 = tp->offset; - svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64); + svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, + (u8 *) & data64); } /* Expected result: one big fat chunk at offset 4 */ @@ -817,7 +864,7 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = &test_data[i]; data64 = tp->offset; - rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, + rv = svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, (u8 *) & data64); if (rv) { @@ -991,8 +1038,9 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) for (i = !randomize; i < vec_len (generate); i++) { tp = generate + i; - svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset, - tp->len, + svm_fifo_enqueue_with_offset (f, + fifo_initial_offset + tp->offset - + f->tail, tp->len, (u8 *) data_pattern + tp->offset); } @@ -1107,7 +1155,7 @@ tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input) for (i = test_n_bytes - 1; i > 0; i--) { - rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i, + rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i - f->tail, sizeof (u8), &test_data[i]); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8)); -- cgit 1.2.3-korg From 2c25a62cc1cc4937165de740a3b32d78429c72d6 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Mon, 26 Jun 2017 11:35:07 -0400 Subject: Horizontal (nSessions) scaling draft - Data structure preallocation. - Input state machine fixes for mid-stream 3-way handshake retries. - Batch connections in the builtin_client - Multiple private fifo segment support - Fix elog simultaneous event type registration - Fix sacks when segment hole is added after highest sacked - Add "accepting" session state for sessions pending accept - Add ssvm non-recursive locking - Estimate RTT for syn-ack - Don't init fifo pointers. We're using relative offsets for ooo segments - CLI to dump individual session Change-Id: Ie0598563fd246537bafba4feed7985478ea1d415 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/svm/ssvm.h | 17 +++ src/svm/svm_fifo.c | 56 +++++--- src/svm/svm_fifo.h | 16 ++- src/svm/svm_fifo_segment.c | 114 +++++++++++----- src/svm/svm_fifo_segment.h | 4 +- src/svm/test_svm_fifo1.c | 10 +- src/uri/uri_udp_test.c | 2 +- src/vnet/session/application.c | 2 + src/vnet/session/application_interface.c | 21 --- src/vnet/session/application_interface.h | 12 +- src/vnet/session/node.c | 23 +--- src/vnet/session/segment_manager.c | 26 ++-- src/vnet/session/segment_manager.h | 4 + src/vnet/session/session.c | 72 +++++++--- src/vnet/session/session.h | 30 ++++- src/vnet/session/session_cli.c | 99 +++++++++++--- src/vnet/session/transport.h | 6 + src/vnet/tcp/builtin_client.c | 118 +++++++++++----- src/vnet/tcp/builtin_client.h | 7 +- src/vnet/tcp/builtin_server.c | 66 +++++++-- src/vnet/tcp/tcp.c | 225 ++++++++++++++++++++++++++++--- src/vnet/tcp/tcp.h | 13 ++ src/vnet/tcp/tcp_debug.h | 13 +- src/vnet/tcp/tcp_input.c | 97 ++++++++----- src/vnet/tcp/tcp_newreno.c | 4 +- src/vnet/tcp/tcp_output.c | 53 +++++--- src/vnet/tcp/tcp_packet.h | 1 + src/vnet/tcp/tcp_test.c | 10 +- src/vnet/udp/udp_input.c | 2 +- 29 files changed, 838 insertions(+), 285 deletions(-) (limited to 'src/vnet/tcp/tcp_newreno.c') diff --git a/src/svm/ssvm.h b/src/svm/ssvm.h index bccfc164..8466e155 100644 --- a/src/svm/ssvm.h +++ b/src/svm/ssvm.h @@ -101,6 +101,15 @@ ssvm_lock (ssvm_shared_header_t * h, u32 my_pid, u32 tag) h->tag = tag; } +always_inline void +ssvm_lock_non_recursive (ssvm_shared_header_t * h, u32 tag) +{ + while (__sync_lock_test_and_set (&h->lock, 1)) + ; + + h->tag = tag; +} + always_inline void ssvm_unlock (ssvm_shared_header_t * h) { @@ -113,6 +122,14 @@ ssvm_unlock (ssvm_shared_header_t * h) } } +always_inline void +ssvm_unlock_non_recursive (ssvm_shared_header_t * h) +{ + h->tag = 0; + CLIB_MEMORY_BARRIER (); + h->lock = 0; +} + static inline void * ssvm_push_heap (ssvm_shared_header_t * sh) { diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index aed5d6a7..da60fee5 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -19,29 +19,29 @@ static inline u8 position_lt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - < ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + < ooo_segment_distance_from_tail (f, b)); } static inline u8 position_leq (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - <= ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + <= ooo_segment_distance_from_tail (f, b)); } static inline u8 position_gt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - > ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + > ooo_segment_distance_from_tail (f, b)); } static inline u32 position_diff (svm_fifo_t * f, u32 posa, u32 posb) { - return ooo_segment_distance_to_tail (f, posa) - - ooo_segment_distance_to_tail (f, posb); + return ooo_segment_distance_from_tail (f, posa) + - ooo_segment_distance_from_tail (f, posb); } static inline u32 @@ -113,7 +113,7 @@ svm_fifo_create (u32 data_size_in_bytes) if (f == 0) return 0; - memset (f, 0, sizeof (*f) + data_size_in_bytes); + memset (f, 0, sizeof (*f)); f->nitems = data_size_in_bytes; f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; @@ -204,7 +204,19 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { s = prev; s_end_pos = ooo_segment_end_pos (f, s); - goto merge; + + /* Check head and tail now since segment may be wider at both ends so + * merge tests lower won't work */ + if (position_lt (f, normalized_position, s->start)) + { + s->start = normalized_position; + s->length = position_diff (f, s_end_pos, s->start); + } + if (position_gt (f, normalized_end_position, s_end_pos)) + { + s->length = position_diff (f, normalized_end_position, s->start); + } + goto check_tail; } s_index = s - f->ooo_segments; @@ -257,8 +269,6 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) * Merge needed */ -merge: - /* Merge at head */ if (position_lt (f, normalized_position, s->start)) { @@ -278,6 +288,7 @@ merge: goto done; } +check_tail: /* The new segment's tail may cover multiple smaller ones */ if (position_gt (f, normalized_end_position, s_end_pos)) { @@ -296,7 +307,8 @@ merge: /* If partial overlap with last, merge */ if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = ooo_segment_end_pos (f, it) - s->start; + s->length = + position_diff (f, ooo_segment_end_pos (f, it), s->start); ooo_segment_del (f, it - f->ooo_segments); } } @@ -319,9 +331,9 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) i32 diff; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + diff = ooo_segment_distance_to_tail (f, s->start); - diff = (f->tail >= s->start) ? - f->tail - s->start : f->nitems + f->tail - s->start; + ASSERT (diff != n_bytes_enqueued); if (diff > n_bytes_enqueued) return 0; @@ -345,8 +357,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - diff = (f->tail >= s->start) ? - f->tail - s->start : f->nitems + f->tail - s->start; + diff = ooo_segment_distance_to_tail (f, s->start); ooo_segment_del (f, index); } /* End of search */ @@ -357,6 +368,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } + ASSERT (bytes >= 0 && bytes <= f->nitems); return bytes; } @@ -401,6 +413,8 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) } else { + ASSERT (0); + /* Account for a zero-copy enqueue done elsewhere */ ASSERT (max_bytes <= (nitems - cursize)); f->tail += max_bytes; @@ -413,6 +427,7 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); /* Atomically increase the queue length */ + ASSERT (cursize + total_copy_bytes <= nitems); __sync_fetch_and_add (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -475,6 +490,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; + ASSERT (required_bytes < nitems); + normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? */ @@ -557,6 +574,7 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) } else { + ASSERT (0); /* Account for a zero-copy dequeue done elsewhere */ ASSERT (max_bytes <= cursize); f->head += max_bytes; @@ -565,6 +583,8 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) total_copy_bytes = max_bytes; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_copy_bytes); __sync_fetch_and_sub (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -702,6 +722,8 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) f->head = (f->head == nitems) ? 0 : f->head; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_drop_bytes); __sync_fetch_and_sub (&f->cursize, total_drop_bytes); return total_drop_bytes; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index f32ef41d..fe21de47 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -133,25 +133,31 @@ svm_fifo_newest_ooo_segment (svm_fifo_t * f) } always_inline u32 -ooo_segment_distance_to_tail (svm_fifo_t * f, u32 a) +ooo_segment_distance_from_tail (svm_fifo_t * f, u32 pos) { /* Ambiguous. Assumption is that ooo segments don't touch tail */ - if (a == f->tail && f->tail == f->head) + if (PREDICT_FALSE (pos == f->tail && f->tail == f->head)) return f->nitems; - return ((f->nitems + a - f->tail) % f->nitems); + return (((f->nitems + pos) - f->tail) % f->nitems); +} + +always_inline u32 +ooo_segment_distance_to_tail (svm_fifo_t * f, u32 pos) +{ + return (((f->nitems + f->tail) - pos) % f->nitems); } always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start); + return ooo_segment_distance_from_tail (f, s->start); } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start) + s->length; + return ooo_segment_distance_from_tail (f, s->start) + s->length; } always_inline u32 diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index c4ac2352..69d4ecb9 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -35,6 +35,11 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, rx_fifo_size = (sizeof (*f) + a->rx_fifo_size) * a->preallocated_fifo_pairs; tx_fifo_size = (sizeof (*f) + a->tx_fifo_size) * a->preallocated_fifo_pairs; + if (0) + clib_warning ("rx_fifo_size %u (%d mb), tx_fifo_size %u (%d mb)", + rx_fifo_size, rx_fifo_size >> 20, + tx_fifo_size, tx_fifo_size >> 20); + /* Allocate rx fifo space. May fail. */ rx_fifo_space = clib_mem_alloc_aligned_at_offset (rx_fifo_size, CLIB_CACHE_LINE_BYTES, 0 /* align_offset */ , @@ -129,7 +134,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) ssvm_pop_heap (oldheap); sh->ready = 1; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -141,35 +146,81 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; ssvm_shared_header_t *sh; svm_fifo_segment_header_t *fsh; + void *oldheap; + u8 **heaps = 0; + mheap_t *heap_header; + int segment_count = 1; + int i; - /* Allocate a fresh segment */ - pool_get (sm->segments, s); - memset (s, 0, sizeof (*s)); - - s->ssvm.ssvm_size = ~0; - s->ssvm.i_am_master = 1; - s->ssvm.my_pid = getpid (); - s->ssvm.name = (u8 *) a->segment_name; - s->ssvm.requested_va = ~0; - - /* Allocate a [sic] shared memory header, in process memory... */ - sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); - s->ssvm.sh = sh; + if (a->private_segment_count && a->private_segment_size) + { + void *mem; + u8 *heap; + u32 pagesize = clib_mem_get_page_size (); + u32 rnd_size; - memset (sh, 0, sizeof (*sh)); - sh->heap = clib_mem_get_heap (); + for (i = 0; i < a->private_segment_count; i++) + { + rnd_size = (a->private_segment_size + (pagesize - 1)) & ~pagesize; + + mem = mmap (0, rnd_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1 /* fd */ , 0 /* offset */ ); + + if (mem == MAP_FAILED) + { + clib_unix_warning ("mmap"); + return -1; + } + heap = mheap_alloc (mem, rnd_size); + heap_header = mheap_header (heap); + heap_header->flags |= MHEAP_FLAG_THREAD_SAFE; + vec_add1 (heaps, heap); + } + segment_count = a->private_segment_count; + } - /* Set up svm_fifo_segment shared header */ - fsh = clib_mem_alloc (sizeof (*fsh)); - memset (fsh, 0, sizeof (*fsh)); - sh->opaque[0] = fsh; - s->h = fsh; - fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + /* Spread preallocated fifo pairs across segments */ + a->preallocated_fifo_pairs /= segment_count; - preallocate_fifo_pairs (fsh, a); + /* Allocate segments */ + for (i = 0; i < segment_count; i++) + { + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = ~0; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = ~0; + + /* Allocate a [sic] shared memory header, in process memory... */ + sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); + s->ssvm.sh = sh; + + memset (sh, 0, sizeof (*sh)); + sh->heap = a->private_segment_count ? heaps[i] : clib_mem_get_heap (); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + if (a->private_segment_count) + { + oldheap = clib_mem_get_heap (); + clib_mem_set_heap (sh->heap); + preallocate_fifo_pairs (fsh, a); + clib_mem_set_heap (oldheap); + } - sh->ready = 1; - a->new_segment_index = s - sm->segments; + sh->ready = 1; + vec_add1 (a->new_segment_indices, s - sm->segments); + } + vec_free (heaps); return (0); } @@ -205,7 +256,7 @@ svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; s->h = fsh; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -230,7 +281,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 1); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -261,7 +312,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, if (PREDICT_FALSE (f == 0)) { ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (0); } @@ -281,7 +332,7 @@ found: } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (f); } @@ -293,10 +344,11 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, svm_fifo_segment_header_t *fsh; void *oldheap; + sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 2); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -325,7 +377,7 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); } void diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 31e14db5..a7a3f469 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -57,10 +57,12 @@ typedef struct { char *segment_name; u32 segment_size; - u32 new_segment_index; + u32 *new_segment_indices; u32 rx_fifo_size; u32 tx_fifo_size; u32 preallocated_fifo_pairs; + u32 private_segment_count; + u32 private_segment_size; } svm_fifo_segment_create_args_t; static inline svm_fifo_segment_private_t * diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c index 63b4a9b7..63d75845 100644 --- a/src/svm/test_svm_fifo1.c +++ b/src/svm/test_svm_fifo1.c @@ -39,7 +39,7 @@ hello_world (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -92,7 +92,7 @@ master (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -128,7 +128,7 @@ mempig (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); for (i = 0; i < 1000; i++) { @@ -186,7 +186,7 @@ offset (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 200 << 10, FIFO_SEGMENT_RX_FREELIST); @@ -246,7 +246,7 @@ slave (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_attach returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); sh = sp->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 45ad35a4..a8e39eaa 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -707,7 +707,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) return; } - segment_index = a->new_segment_index; + segment_index = a->new_segment_indices[0]; vec_add2 (utm->seg, seg, 1); memcpy (seg, sm->segments + segment_index, sizeof (*seg)); sleep (1); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index 3cc56f37..8a953719 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -174,6 +174,8 @@ application_init (application_t * app, u32 api_client_index, u64 * options, props->preallocated_fifo_pairs = options[APP_OPTIONS_PREALLOC_FIFO_PAIRS]; props->use_private_segment = options[APP_OPTIONS_FLAGS] & APP_OPTIONS_FLAGS_BUILTIN_APP; + props->private_segment_count = options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT]; + props->private_segment_size = options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE]; first_seg_size = options[SESSION_OPTIONS_SEGMENT_SIZE]; if ((rv = segment_manager_init (sm, props, first_seg_size))) diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 338ae857..566a52d7 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -275,27 +275,6 @@ vnet_application_detach (vnet_app_detach_args_t * a) return 0; } -session_type_t -session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) -{ - if (proto == SESSION_PROTO_TCP) - { - if (is_ip4) - return SESSION_TYPE_IP4_TCP; - else - return SESSION_TYPE_IP6_TCP; - } - else - { - if (is_ip4) - return SESSION_TYPE_IP4_UDP; - else - return SESSION_TYPE_IP6_UDP; - } - - return SESSION_N_TYPES; -} - int vnet_bind_uri (vnet_bind_args_t * a) { diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index 4d6f9def..ed9f89b3 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -22,12 +22,6 @@ #include #include -typedef enum _session_api_proto -{ - SESSION_PROTO_TCP, - SESSION_PROTO_UDP -} session_api_proto_t; - typedef struct _vnet_app_attach_args_t { /** Binary API client index */ @@ -65,7 +59,7 @@ typedef struct _vnet_bind_args_t struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; @@ -98,7 +92,7 @@ typedef struct _vnet_connect_args struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; u32 app_index; @@ -120,6 +114,8 @@ typedef enum APP_EVT_QUEUE_SIZE, APP_OPTIONS_FLAGS, APP_OPTIONS_PREALLOC_FIFO_PAIRS, + APP_OPTIONS_PRIVATE_SEGMENT_COUNT, + APP_OPTIONS_PRIVATE_SEGMENT_SIZE, SESSION_OPTIONS_SEGMENT_SIZE, SESSION_OPTIONS_ADD_SEGMENT_SIZE, SESSION_OPTIONS_RX_FIFO_SIZE, diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b24f5fd9..56e62637 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -378,24 +378,12 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, n_tx_pkts, 0); } -stream_session_t * -session_event_get_session (session_fifo_event_t * e0, u8 thread_index) +always_inline stream_session_t * +session_event_get_session (session_fifo_event_t * e, u8 thread_index) { - svm_fifo_t *f0; - stream_session_t *s0; - u32 session_index0; - - f0 = e0->fifo; - session_index0 = f0->master_session_index; - - /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (f0->master_thread_index == thread_index); - - s0 = stream_session_get_if_valid (session_index0, thread_index); - - ASSERT (s0 == 0 || s0->thread_index == thread_index); - - return s0; + ASSERT (e->fifo->master_thread_index == thread_index); + return stream_session_get_if_valid (e->fifo->master_session_index, + thread_index); } void @@ -569,7 +557,6 @@ skip_dequeue: case FIFO_EVENT_BUILTIN_RX: s0 = session_event_get_session (e0, my_thread_index); svm_fifo_unset_event (s0->server_rx_fifo); - /* Get session's server */ app = application_get (s0->app_index); app->cb_fns.builtin_server_rx_callback (s0); break; diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index dcef6261..262b7faa 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -30,7 +30,7 @@ segment_manager_t *segment_managers = 0; /** * Process private segment index */ -u32 private_segment_index = ~0; +u32 *private_segment_indices; /** * Default fifo and segment size. TODO config. @@ -70,7 +70,8 @@ session_manager_add_segment_i (segment_manager_t * sm, u32 segment_size, return VNET_API_ERROR_SVM_SEGMENT_CREATE_FAIL; } - vec_add1 (sm->segment_indices, ca->new_segment_index); + vec_append (sm->segment_indices, ca->new_segment_indices); + vec_free (ca->new_segment_indices); return 0; } @@ -111,22 +112,23 @@ static void { svm_fifo_segment_create_args_t _a, *a = &_a; - if (private_segment_index != ~0) + if (private_segment_indices) return; memset (a, 0, sizeof (*a)); a->segment_name = "process-private-segment"; a->segment_size = ~0; - a->new_segment_index = ~0; a->rx_fifo_size = props->rx_fifo_size; a->tx_fifo_size = props->tx_fifo_size; a->preallocated_fifo_pairs = props->preallocated_fifo_pairs; + a->private_segment_count = props->private_segment_count; + a->private_segment_size = props->private_segment_size; if (svm_fifo_segment_create_process_private (a)) clib_warning ("Failed to create process private segment"); - private_segment_index = a->new_segment_index; - ASSERT (private_segment_index != ~0); + private_segment_indices = a->new_segment_indices; + ASSERT (vec_len (private_segment_indices)); } /** @@ -156,10 +158,10 @@ segment_manager_init (segment_manager_t * sm, } else { - if (private_segment_index == ~0) + if (vec_len (private_segment_indices) == 0) segment_manager_alloc_process_private_segment (properties); - ASSERT (private_segment_index != ~0); - vec_add1 (sm->segment_indices, private_segment_index); + ASSERT (vec_len (private_segment_indices)); + vec_append (sm->segment_indices, private_segment_indices); } clib_spinlock_init (&sm->lockp); @@ -320,7 +322,7 @@ again: /* See if we're supposed to create another segment */ if (*server_rx_fifo == 0) { - if (sm->properties->add_segment) + if (sm->properties->add_segment && !sm->properties->use_private_segment) { if (added_a_segment) { @@ -379,6 +381,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, svm_fifo_segment_free_fifo (fifo_segment, tx_fifo, FIFO_SEGMENT_TX_FREELIST); + /* Don't try to delete process-private segments */ + if (sm->properties->private_segment_count > 0) + return; + /* Remove segment only if it holds no fifos and not the first */ if (sm->segment_indices[0] != svm_segment_index && !svm_fifo_segment_has_fifos (fifo_segment)) diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h index df38d2b3..41abeb22 100644 --- a/src/vnet/session/segment_manager.h +++ b/src/vnet/session/segment_manager.h @@ -39,6 +39,10 @@ typedef struct _segment_manager_properties /** Use private memory segment instead of shared memory */ u8 use_private_segment; + + /** Use one or more private mheaps, instead of the global heap */ + u32 private_segment_count; + u32 private_segment_size; } segment_manager_properties_t; typedef struct _segment_manager diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index fe198044..0a86d563 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -198,21 +198,28 @@ stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) */ stream_session_t * stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; + stream_session_t *s; int rv; /* Lookup session amongst established ones */ make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); if (rv == 0) - return stream_session_get_tsi (kv4.value, my_thread_index); + return stream_session_get_from_handle (kv4.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener4 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener4 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return stream_session_get_from_handle (kv4.value); + return 0; } stream_session_t * @@ -242,20 +249,27 @@ stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) * wildcarded local source (listener bound to all interfaces) */ stream_session_t * stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = vnet_get_session_manager_main (); session_kv6_t kv6; + stream_session_t *s; int rv; make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); if (rv == 0) - return stream_session_get_tsi (kv6.value, my_thread_index); + return stream_session_get_from_handle (kv6.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener6 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener6 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return stream_session_get_from_handle (kv6.value); + return 0; } stream_session_t * @@ -340,7 +354,6 @@ stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); if (rv == 0) return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); - return 0; } @@ -390,6 +403,8 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, u32 thread_index = tc->thread_index; int rv; + ASSERT (thread_index == vlib_get_thread_index ()); + if ((rv = segment_manager_alloc_session_fifos (sm, &server_rx_fifo, &server_tx_fifo, &fifo_segment_index))) @@ -854,6 +869,7 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, s->app_index = server->index; s->listener_index = listener_index; + s->session_state = SESSION_STATE_ACCEPTING; /* Shoulder-tap the server */ if (notify) @@ -1088,6 +1104,27 @@ session_vpp_event_queue_allocate (session_manager_main_t * smm, } } +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4) +{ + if (proto == TRANSPORT_PROTO_TCP) + { + if (is_ip4) + return SESSION_TYPE_IP4_TCP; + else + return SESSION_TYPE_IP6_TCP; + } + else + { + if (is_ip4) + return SESSION_TYPE_IP4_UDP; + else + return SESSION_TYPE_IP6_UDP; + } + + return SESSION_N_TYPES; +} + static clib_error_t * session_manager_main_enable (vlib_main_t * vm) { @@ -1131,14 +1168,13 @@ session_manager_main_enable (vlib_main_t * vm) session_vpp_event_queue_allocate (smm, i); /* $$$$ preallocate hack config parameter */ - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) { - stream_session_t *ss; + stream_session_t *ss __attribute__ ((unused)); pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); - memset (ss, 0, sizeof (*ss)); } - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) pool_put_index (smm->sessions[0], i); clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", @@ -1208,9 +1244,10 @@ session_manager_main_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (session_manager_main_init) - static clib_error_t *session_config_fn (vlib_main_t * vm, - unformat_input_t * input) +VLIB_INIT_FUNCTION (session_manager_main_init); + +static clib_error_t * +session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t *smm = &session_manager_main; u32 nitems; @@ -1224,6 +1261,9 @@ VLIB_INIT_FUNCTION (session_manager_main_init) else clib_warning ("event queue length %d too small, ignored", nitems); } + if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 5fa4225c..b4507d4e 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -80,6 +80,10 @@ typedef enum SESSION_N_TYPES, } session_type_t; + +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4); + /* * Application session state */ @@ -87,6 +91,7 @@ typedef enum { SESSION_STATE_LISTENING, SESSION_STATE_CONNECTING, + SESSION_STATE_ACCEPTING, SESSION_STATE_READY, SESSION_STATE_CLOSED, SESSION_STATE_N_STATES, @@ -211,8 +216,12 @@ struct _session_manager_main /** Per transport rx function that can either dequeue or peek */ session_fifo_rx_fn *session_tx_fns[SESSION_N_TYPES]; + /** Session manager is enabled */ u8 is_enabled; + /** Preallocate session config parameter */ + u32 preallocated_sessions; + /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -247,13 +256,12 @@ stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); + u16 rmt_port, u8 proto); stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8, u32 thread_index); + u16 rmt_port, u8 proto); transport_connection_t * stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, @@ -277,9 +285,24 @@ stream_session_get_tsi (u64 ti_and_si, u32 thread_index) ti_and_si & 0xFFFFFFFFULL); } +always_inline u8 +stream_session_is_valid (u32 si, u8 thread_index) +{ + stream_session_t *s; + s = pool_elt_at_index (session_manager_main.sessions[thread_index], si); + if (s->thread_index != thread_index || s->session_index != si + || s->server_rx_fifo->master_session_index != si + || s->server_tx_fifo->master_session_index != si + || s->server_rx_fifo->master_thread_index != thread_index + || s->server_tx_fifo->master_thread_index != thread_index) + return 0; + return 1; +} + always_inline stream_session_t * stream_session_get (u32 si, u32 thread_index) { + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } @@ -292,6 +315,7 @@ stream_session_get_if_valid (u64 si, u32 thread_index) if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) return 0; + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 6b8341aa..e06bc586 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -47,7 +47,8 @@ format_stream_session (u8 * s, va_list * args) svm_fifo_max_enqueue (ss->server_tx_fifo), stream_session_get_index (ss)); - if (ss->session_state == SESSION_STATE_READY) + if (ss->session_state == SESSION_STATE_READY + || ss->session_state == SESSION_STATE_ACCEPTING) { s = format (s, "%U", tp_vft->format_connection, ss->connection_index, ss->thread_index, verbose); @@ -68,8 +69,9 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose); + s = + format (s, "[CL] %U", tp_vft->format_connection, ss->connection_index, + ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); if (verbose > 1) @@ -93,7 +95,13 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, int verbose = 0, i; stream_session_t *pool; stream_session_t *s; - u8 *str = 0; + u8 *str = 0, one_session = 0, proto_set = 0, proto = 0; + u8 is_ip4 = 0, s_type = 0; + ip4_address_t lcl_ip4, rmt_ip4; + u32 lcl_port = 0, rmt_port = 0; + + memset (&lcl_ip4, 0, sizeof (lcl_ip4)); + memset (&rmt_ip4, 0, sizeof (rmt_ip4)); if (!smm->is_enabled) { @@ -106,10 +114,43 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "verbose")) verbose = 1; + else if (unformat (input, "tcp")) + { + proto_set = 1; + proto = TRANSPORT_PROTO_TCP; + } + else if (unformat (input, "%U:%d->%U:%d", + unformat_ip4_address, &lcl_ip4, &lcl_port, + unformat_ip4_address, &rmt_ip4, &rmt_port)) + { + one_session = 1; + is_ip4 = 1; + } + else break; } + if (one_session) + { + if (!proto_set) + { + vlib_cli_output (vm, "proto not set"); + return clib_error_return (0, "proto not set"); + } + + s_type = session_type_from_proto_and_ip (proto, is_ip4); + s = stream_session_lookup4 (&lcl_ip4, &rmt_ip4, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), s_type); + if (s) + vlib_cli_output (vm, "%U", format_stream_session, s, 2); + else + vlib_cli_output (vm, "session does not exist"); + + return 0; + } + for (i = 0; i < vec_len (smm->sessions); i++) { u32 once_per_pool; @@ -146,6 +187,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } else vlib_cli_output (vm, "Thread %d: no active sessions", i); + vec_reset_length (str); } vec_free (str); @@ -161,15 +203,22 @@ VLIB_CLI_COMMAND (show_session_command, static) = }; /* *INDENT-ON* */ +static int +clear_session (stream_session_t * s) +{ + application_t *server = application_get (s->app_index); + server->cb_fns.session_disconnect_callback (s); + return 0; +} + static clib_error_t * clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { session_manager_main_t *smm = &session_manager_main; - u32 thread_index = 0; + u32 thread_index = 0, clear_all = 0; u32 session_index = ~0; - stream_session_t *pool, *session; - application_t *server; + stream_session_t **pool, *session; if (!smm->is_enabled) { @@ -182,28 +231,36 @@ clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "session %d", &session_index)) ; + else if (unformat (input, "all")) + clear_all = 1; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } - if (session_index == ~0) + if (!clear_all && session_index == ~0) return clib_error_return (0, "session required, but not set."); - if (thread_index > vec_len (smm->sessions)) - return clib_error_return (0, "thread %d out of range [0-%d]", - thread_index, vec_len (smm->sessions)); - - pool = smm->sessions[thread_index]; - - if (pool_is_free_index (pool, session_index)) - return clib_error_return (0, "session %d not active", session_index); - - session = pool_elt_at_index (pool, session_index); - server = application_get (session->app_index); + if (session_index != ~0) + { + session = stream_session_get_if_valid (session_index, thread_index); + if (!session) + return clib_error_return (0, "no session %d on thread %d", + session_index, thread_index); + clear_session (session); + } - /* Disconnect both app and transport */ - server->cb_fns.session_disconnect_callback (session); + if (clear_all) + { + /* *INDENT-OFF* */ + vec_foreach (pool, smm->sessions) + { + pool_foreach(session, *pool, ({ + clear_session (session); + })); + }; + /* *INDENT-ON* */ + } return 0; } diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 561a9257..9c38bab9 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -225,6 +225,12 @@ make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) t->rmt_port, t->proto); } +typedef enum _transport_proto +{ + TRANSPORT_PROTO_TCP, + TRANSPORT_PROTO_UDP +} transport_proto_t; + typedef struct _transport_endpoint { ip46_address_t ip; /** ip address */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 6f8be082..a6c8a235 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -170,62 +170,90 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { tclient_main_t *tm = &tclient_main; int my_thread_index = vlib_get_thread_index (); - vl_api_disconnect_session_t *dmp; session_t *sp; int i; int delete_session; u32 *connection_indices; - u32 tx_quota = 0; - u32 delta, prev_bytes_received_this_session; + u32 *connections_this_batch; + u32 nconnections_this_batch; connection_indices = tm->connection_index_by_thread[my_thread_index]; + connections_this_batch = + tm->connections_this_batch_by_thread[my_thread_index]; - if (tm->run_test == 0 || vec_len (connection_indices) == 0) + if ((tm->run_test == 0) || + ((vec_len (connection_indices) == 0) + && vec_len (connections_this_batch) == 0)) return 0; - for (i = 0; i < vec_len (connection_indices); i++) + /* Grab another pile of connections */ + if (PREDICT_FALSE (vec_len (connections_this_batch) == 0)) + { + nconnections_this_batch = + clib_min (tm->connections_per_batch, vec_len (connection_indices)); + + ASSERT (nconnections_this_batch > 0); + vec_validate (connections_this_batch, nconnections_this_batch - 1); + clib_memcpy (connections_this_batch, + connection_indices + vec_len (connection_indices) + - nconnections_this_batch, + nconnections_this_batch * sizeof (u32)); + _vec_len (connection_indices) -= nconnections_this_batch; + } + + if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch + && tm->prev_conns == vec_len (connections_this_batch))) + { + tm->repeats++; + tm->prev_conns = vec_len (connections_this_batch); + if (tm->repeats == 500000) + { + clib_warning ("stuck clients"); + } + } + else + { + tm->prev_conns = vec_len (connections_this_batch); + tm->repeats = 0; + } + + for (i = 0; i < vec_len (connections_this_batch); i++) { delete_session = 1; - sp = pool_elt_at_index (tm->sessions, connection_indices[i]); + sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]); - if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) + if (sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; - tx_quota++; } - if (!tm->no_return && sp->bytes_to_receive > 0) + if (sp->bytes_to_receive > 0) { - prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); - delta = sp->bytes_received - prev_bytes_received_this_session; - if (delta > 0) - tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) { - __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); + u32 index, thread_index; + stream_session_t *s; + + __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); - dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = tm->my_client_index; - dmp->handle = sp->vpp_session_handle; - if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, - 1)) + stream_session_parse_handle (sp->vpp_session_handle, + &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + + if (s) { - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; + stream_session_disconnect (s); + vec_delete (connections_this_batch, 1, i); + i--; __sync_fetch_and_add (&tm->ready_connections, -1); } else - { - vl_msg_api_free (dmp); - } + clib_warning ("session AWOL?"); /* Kick the debug CLI process */ if (tm->ready_connections == 0) @@ -236,6 +264,10 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } } } + + tm->connection_index_by_thread[my_thread_index] = connection_indices; + tm->connections_this_batch_by_thread[my_thread_index] = + connections_this_batch; return 0; } @@ -356,6 +388,8 @@ tcp_test_clients_init (vlib_main_t * vm) tm->vlib_main = vm; vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains); + vec_validate (tm->connections_this_batch_by_thread, + thread_main->n_vlib_mains); return 0; } @@ -388,7 +422,8 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, pool_get (tm->sessions, session); memset (session, 0, sizeof (*session)); session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send; session->server_rx_fifo = s->server_rx_fifo; session->server_rx_fifo->client_session_index = session_index; session->server_tx_fifo = s->server_tx_fifo; @@ -485,6 +520,8 @@ attach_builtin_test_clients_app (void) options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count; + options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size; options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; @@ -561,6 +598,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->bytes_to_send = 8192; tm->no_return = 0; tm->fifo_size = 64 << 10; + tm->connections_per_batch = 1000; + tm->private_segment_count = 0; + tm->private_segment_size = 0; vec_free (tm->connect_uri); @@ -582,6 +622,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->no_return = 1; else if (unformat (input, "fifo-size %d", &tm->fifo_size)) tm->fifo_size <<= 10; + else if (unformat (input, "private-segment-count %d", + &tm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + tm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + tm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + tm->private_segment_size = tmp; + else if (unformat (input, "preallocate-fifos")) + tm->prealloc_fifos = 1; + else + if (unformat (input, "client-batch %d", &tm->connections_per_batch)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); @@ -688,9 +742,13 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "zero delta-t?"); cleanup: - pool_free (tm->sessions); + tm->run_test = 0; for (i = 0; i < vec_len (tm->connection_index_by_thread); i++) - vec_reset_length (tm->connection_index_by_thread[i]); + { + vec_reset_length (tm->connection_index_by_thread[i]); + vec_reset_length (tm->connections_this_batch_by_thread[i]); + } + pool_free (tm->sessions); return 0; } diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 3462e0ee..38af231d 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -63,6 +63,9 @@ typedef struct u32 configured_segment_size; u32 fifo_size; u32 expected_connections; /**< Number of clients/connections */ + u32 connections_per_batch; /**< Connections to rx/tx at once */ + u32 private_segment_count; /**< Number of private fifo segs */ + u32 private_segment_size; /**< size of private fifo segs */ /* * Test state variables @@ -72,6 +75,7 @@ typedef struct uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + u32 **connections_this_batch_by_thread; /**< active connection batch */ pthread_t client_thread_handle; volatile u32 ready_connections; @@ -82,7 +86,8 @@ typedef struct f64 test_start_time; f64 test_end_time; - + u32 prev_conns; + u32 repeats; /* * Flags */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 775bfc26..8e958ac0 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -56,12 +56,15 @@ typedef struct u32 fifo_size; /**< Fifo size */ u32 rcv_buffer_size; /**< Rcv buffer size */ u32 prealloc_fifos; /**< Preallocate fifos */ + u32 private_segment_count; /**< Number of private segments */ + u32 private_segment_size; /**< Size of private segments */ /* * Test state */ u8 **rx_buf; /**< Per-thread RX buffer */ u64 byte_index; + u32 **rx_retries; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -77,6 +80,8 @@ builtin_session_accept_callback (stream_session_t * s) session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; bsm->byte_index = 0; + vec_validate (bsm->rx_retries[s->thread_index], s->session_index); + bsm->rx_retries[s->thread_index][s->session_index] = 0; return 0; } @@ -173,11 +178,16 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - u32 my_thread_id = vlib_get_thread_index (); + u32 thread_index = vlib_get_thread_index (); + + ASSERT (s->thread_index == thread_index); rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; + ASSERT (rx_fifo->master_thread_index == thread_index); + ASSERT (tx_fifo->master_thread_index == thread_index); + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -201,21 +211,31 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - q = bsm->vpp_queue[s->thread_index]; + q = bsm->vpp_queue[thread_index]; if (PREDICT_FALSE (q->cursize == q->maxsize)) clib_warning ("out of event queue space"); - else - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* don't wait for mutex */ ); + else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */ + )) + clib_warning ("failed to enqueue self-tap"); + + bsm->rx_retries[thread_index][s->session_index]++; + if (bsm->rx_retries[thread_index][s->session_index] == 500000) + { + clib_warning ("session stuck: %U", format_stream_session, s, 2); + } + } + else + { + bsm->rx_retries[thread_index][s->session_index] = 0; } return 0; } - _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; + _vec_len (bsm->rx_buf[thread_index]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -225,7 +245,7 @@ builtin_server_rx_callback (stream_session_t * s) */ n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); if (n_written != max_transfer) clib_warning ("short trout!"); @@ -237,11 +257,13 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("failed to enqueue tx evt"); } - if (PREDICT_FALSE (max_enqueue < max_dequeue)) + if (PREDICT_FALSE (n_written < max_dequeue)) goto rx_event; return 0; @@ -328,9 +350,13 @@ server_attach () a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; - a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size; a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = bsm->prealloc_fifos ? bsm->prealloc_fifos : 1; + + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -374,6 +400,8 @@ server_create (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); vec_validate (bsm->rx_buf, num_threads - 1); + vec_validate (bsm->rx_retries, num_threads - 1); + for (i = 0; i < num_threads; i++) vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); @@ -435,11 +463,14 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, { builtin_server_main_t *bsm = &builtin_server_main; int rv; + u32 tmp; bsm->no_echo = 0; bsm->fifo_size = 64 << 10; bsm->rcv_buffer_size = 128 << 10; bsm->prealloc_fifos = 0; + bsm->private_segment_count = 0; + bsm->private_segment_size = 0; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -449,8 +480,17 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, bsm->fifo_size <<= 10; else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) ; - else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) + else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bsm->private_segment_count)) ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + bsm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + bsm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + bsm->private_segment_size = tmp; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 4e85eb3f..f379e699 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -74,8 +74,16 @@ static void tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); - TCP_EVT_DBG (TCP_EVT_UNBIND, - pool_elt_at_index (tm->listener_pool, listener_index)); + tcp_connection_t *tc; + + tc = pool_elt_at_index (tm->listener_pool, listener_index); + + TCP_EVT_DBG (TCP_EVT_UNBIND, tc); + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put_index (tm->listener_pool, listener_index); } @@ -124,9 +132,20 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Check if half-open */ if (tc->state == TCP_STATE_SYN_SENT) - pool_put (tm->half_open_connections, tc); + { + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->half_open_connections, tc); + } else - pool_put (tm->connections[tc->c_thread_index], tc); + { + int thread_index = tc->c_thread_index; + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->connections[thread_index], tc); + } } /** @@ -168,13 +187,14 @@ tcp_connection_reset (tcp_connection_t * tc) /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); - stream_session_reset_notify (&tc->connection); + + /* Wait for cleanup from session layer but not forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: return; } - } /** @@ -278,6 +298,9 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) tries = max - min; time_now = tcp_time_now (); + /* Only support active opens from thread 0 */ + ASSERT (vlib_get_thread_index () == 0); + /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); @@ -343,6 +366,7 @@ tcp_connection_timers_reset (tcp_connection_t * tc) } } +#if 0 typedef struct ip4_tcp_hdr { ip4_header_t ip; @@ -435,6 +459,7 @@ tcp_connection_fib_attach (tcp_connection_t * tc) tcp_connection_stack_on_fib_entry (tc); } +#endif /* 0 */ /** Initialize tcp connection variables * @@ -447,7 +472,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); - tcp_connection_fib_attach (tc); + // tcp_connection_fib_attach (tc); } int @@ -485,14 +510,38 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) if (is_ip4) { ip4_address_t *ip4; - ip4 = ip_interface_get_first_ip (sw_if_index, 1); - lcl_addr.ip4.as_u32 = ip4->as_u32; + int index; + if (vec_len (tm->ip4_src_addresses)) + { + index = tm->last_v4_address_rotor++; + if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses)) + tm->last_v4_address_rotor = 0; + lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32; + } + else + { + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } } else { ip6_address_t *ip6; - ip6 = ip_interface_get_first_ip (sw_if_index, 0); - clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + int index; + + if (vec_len (tm->ip6_src_addresses)) + { + index = tm->last_v6_address_rotor++; + if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses)) + tm->last_v6_address_rotor = 0; + clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index], + sizeof (*ip6)); + } + else + { + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } } /* Allocate source port */ @@ -614,7 +663,7 @@ u8 * format_tcp_vars (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n", + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, tc->snd_una_max - tc->iss); s = format (s, " rcv_nxt %u rcv_las %u\n", @@ -628,12 +677,17 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u", tc->prev_ssthresh, tc->snd_congestion - tc->iss, tc->rcv_dupacks); + s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss); + s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr, + tc->tsecr_last_ack); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); + s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + tcp_time_now () - tc->tsval_recent_age); s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -719,11 +773,21 @@ format_tcp_sacks (u8 * s, va_list * args) tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); sack_block_t *sacks = tc->snd_sacks; sack_block_t *block; - vec_foreach (block, sacks) - { - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } return s; } @@ -796,14 +860,18 @@ tcp_session_send_mss (transport_connection_t * trans_conn) always_inline u32 tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) { - if (tc->snd_wnd < tc->snd_mss) + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) { return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; } /* If we can't write at least a segment, don't try at all */ - if (snd_space < tc->snd_mss) - return 0; + if (PREDICT_FALSE (snd_space < tc->snd_mss)) + { + if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) + return snd_space; + return 0; + } /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1042,6 +1110,8 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; + int thread, i; + tcp_connection_t *tc __attribute__ ((unused)); if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1074,6 +1144,27 @@ tcp_main_enable (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (tm->connections, num_threads - 1); + /* + * Preallocate connections + */ + for (thread = 0; thread < num_threads; thread++) + { + for (i = 0; i < tm->preallocated_connections; i++) + pool_get (tm->connections[thread], tc); + + for (i = 0; i < tm->preallocated_connections; i++) + pool_put_index (tm->connections[thread], i); + } + + /* + * Preallocate half-open connections + */ + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_get (tm->half_open_connections, tc); + + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_put_index (tm->half_open_connections, i); + /* Initialize per worker thread tx buffers (used for control messages) */ vec_validate (tm->tx_buffers, num_threads - 1); @@ -1116,7 +1207,6 @@ tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - tm->vlib_main = vm; tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; @@ -1125,6 +1215,97 @@ tcp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (tcp_init); + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "preallocated-connections %d", + &tm->preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tm->preallocated_half_open_connections)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + +static clib_error_t * +tcp_src_address (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + int v4set = 0; + int v6set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip4_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v4start)); + v6set = 1; + } + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at least one v4 or v6 address required"); + + if (v4set) + { + u32 tmp; + + do + { + vec_add1 (tm->ip4_src_addresses, v4start); + tmp = clib_net_to_host_u32 (v4start.as_u32); + tmp++; + v4start.as_u32 = clib_host_to_net_u32 (tmp); + } + while (clib_host_to_net_u32 (v4start.as_u32) <= + clib_host_to_net_u32 (v4end.as_u32)); + } + if (v6set) + { + clib_warning ("v6 src address list unimplemented..."); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address [- ] add src address range", + .function = tcp_src_address, +}; +/* *INDENT-ON* */ + + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 12d804b8..37b10fd4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -348,6 +348,16 @@ typedef struct _tcp_main /* Flag that indicates if stack is on or off */ u8 is_enabled; + /** Number of preallocated connections */ + u32 preallocated_connections; + u32 preallocated_half_open_connections; + + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ + ip4_address_t *ip4_src_addresses; + u32 last_v4_address_rotor; + u32 last_v6_address_rotor; + ip6_address_t *ip6_src_addresses; + /* convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -569,6 +579,7 @@ tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -577,6 +588,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) return; @@ -588,6 +600,7 @@ tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) always_inline void tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->timers[timer_id]); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ae68ad1b..be51bca2 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -383,9 +383,16 @@ typedef enum _tcp_dbg_evt "establish", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = _timer_id; \ - ed->data[1] = _timer_id; \ + if (_tc) \ + { \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _timer_id; \ + ed->data[1] = _timer_id; \ + } \ + else \ + { \ + clib_warning ("pop for unexisting connection %d", _tc_index); \ + } \ } #define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a2e6dad1..45db0da6 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -251,6 +251,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { + ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } @@ -383,12 +384,9 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) if (tc->srtt != 0) { err = mrtt - tc->srtt; -// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ -// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; - tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); diff = (clib_abs (err) - (int) tc->rttvar) >> 2; tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); @@ -491,6 +489,14 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } +static u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + /** * Checks if ack is a congestion control event. */ @@ -503,7 +509,7 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, *is_dack = tc->sack_sb.last_sacked_bytes || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); - return (*is_dack || tcp_in_cong_recovery (tc)); + return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); } void @@ -750,10 +756,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) * last hole end */ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->high_sacked) - && seq_gt (tc->snd_una_max, last_hole->end)) - last_hole->end = tc->snd_una_max; - /* keep track of max byte sacked for when the last hole + if (seq_gt (tc->snd_una_max, last_hole->end)) + { + if (seq_geq (last_hole->start, sb->high_sacked)) + { + last_hole->end = tc->snd_una_max; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_una_max)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_una_max); + } + } + /* Keep track of max byte sacked for when the last hole * is acked */ if (seq_gt (tmp.end, sb->high_sacked)) sb->high_sacked = tmp.end; @@ -764,7 +780,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { blk = &tc->rcv_opts.sacks[blk_index]; - if (seq_leq (blk->start, hole->start)) { /* Block covers hole. Remove hole */ @@ -784,6 +799,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } else if (!next_hole) { + ASSERT (seq_geq (sb->high_sacked, ack)); sb->snd_una_adv = sb->high_sacked - ack; sb->last_bytes_delivered += sb->high_sacked - hole->end; } @@ -819,7 +835,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { hole->end = blk->start; } - hole = scoreboard_next_hole (sb, hole); } } @@ -827,10 +842,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->holes[sb->head].start == ack + sb->snd_una_adv); } /** @@ -916,7 +934,8 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tc->snd_rxt_ts + return (tcp_in_recovery (tc) + && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } @@ -994,6 +1013,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { ASSERT (tc->snd_una != tc->snd_una_max || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) @@ -1012,17 +1032,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) goto partial_ack_test; } - /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold, - * and the following doesn't hold: the congestion window is - * greater than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes (XXX) - * 2) RFC6582 heuristic to avoid multiple fast retransmits + /* If of of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp */ - if ((seq_gt (tc->snd_una, tc->snd_congestion) - || !(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) { tc->rcv_dupacks = 0; return; @@ -1038,6 +1061,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) * three segments that have left the network and should've been * buffered at the receiver XXX */ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; + ASSERT (tc->cwnd >= tc->snd_mss); /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts) @@ -1112,7 +1136,7 @@ partial_ack: >= tc->sack_sb.last_bytes_delivered); rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - tc->sack_sb.last_bytes_delivered; - if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ @@ -1301,6 +1325,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, { int written; + ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Update rcv_nxt and be done. */ if (PREDICT_FALSE (data_len == 0)) { @@ -1450,6 +1476,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; + vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; vlib_buffer_advance (b, n_bytes_to_drop); goto in_order; @@ -1912,11 +1939,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); /* Make sure after data segment processing ACK is sent */ new_tc0->flags |= TCP_CONN_SNDACK; + + /* Update rtt with the syn-ack sample */ + new_tc0->bytes_acked = 1; + tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); } /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ else @@ -1932,9 +1960,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); + tc0->rtt_ts = 0; + tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2151,8 +2178,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; - - /* Shoulder tap the server */ stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ @@ -2175,6 +2200,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ if (tc0->snd_una == tc0->snd_una_max) { + ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); @@ -2545,10 +2571,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); - /* Init fifo pointers after we have iss */ - stream_session_init_fifos_pointers (&child0->connection, - child0->irs + 1, - child0->iss + 1); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -2886,9 +2908,12 @@ do { \ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. */ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -2905,12 +2930,14 @@ do { \ _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN in reply to our FIN from the other side */ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN confirming that the peer (app) has closed */ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -2929,6 +2956,8 @@ do { \ TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c825e952..103fea4c 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -63,8 +63,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) * window deflation" attempts to ensure that, when fast recovery * eventually ends, approximately ssthresh amount of data will be * outstanding in the network.*/ - tc->cwnd = (tc->cwnd > tc->bytes_acked) ? - tc->cwnd - tc->bytes_acked : 0; + tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ? + tc->cwnd - tc->bytes_acked : tc->snd_mss; if (tc->bytes_acked > tc->snd_mss) tc->cwnd += tc->snd_mss; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 41bebcb3..b418e8ba 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,17 +19,20 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, TCP_OUTPUT_N_NEXT } tcp_output_next_t; #define foreach_tcp4_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") #define foreach_tcp6_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") static char *tcp_error_strings[] = { #define tcp_error(n,s) s, @@ -427,16 +430,16 @@ tcp_init_mss (tcp_connection_t * tc) #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + vlib_get_main(), my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -445,12 +448,12 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ } while (0) always_inline void @@ -757,23 +760,22 @@ void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -851,6 +853,13 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + /* Enqueue the packet */ f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); @@ -1144,6 +1153,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; } else { @@ -1232,7 +1242,7 @@ tcp_timer_persist_handler (u32 index) /* Nothing to send */ if (n_bytes <= 0) { - clib_warning ("persist found nothing to send"); + // clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } @@ -1448,7 +1458,7 @@ tcp46_output_inline (vlib_main_t * vm, tcp_connection_t *tc0; tcp_tx_trace_t *t0; tcp_header_t *th0 = 0; - u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; to_next[0] = bi0; @@ -1527,6 +1537,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } +#if 0 /* Make sure we haven't lost route to our peer */ if (PREDICT_FALSE (tc0->last_fib_check < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) @@ -1547,6 +1558,10 @@ tcp46_output_inline (vlib_main_t * vm, /* Use pre-computed dpo to set next node */ next0 = tc0->c_rmt_dpo.dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; done: diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index a6f62ee1..9ccfe655 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -168,6 +168,7 @@ typedef struct #define TCP_OPTION_LEN_TIMESTAMP 10 #define TCP_OPTION_LEN_SACK_BLOCK 8 +#define TCP_HDR_LEN_MAX 60 #define TCP_WND_MAX 65535U #define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ #define TCP_OPTS_ALIGN 4 diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index a461e3b8..510deb4f 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -290,7 +290,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) { tcp_connection_t _tc, *tc = &_tc; sack_block_t *sacks; - int i, verbose = 0; + int i, verbose = 0, expected; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -326,8 +326,12 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) sacks = vec_dup (tc->snd_sacks); tcp_update_sack_list (tc, 1100, 1200); - TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", - vec_len (tc->snd_sacks), 5); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5; + TCP_TEST ((vec_len (tc->snd_sacks) == expected), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected); TCP_TEST ((tc->snd_sacks[0].start == 1100), "first sack block start %u expected %u", tc->snd_sacks[0].start, 1100); diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index e6b4f8fc..9a8ff076 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -123,7 +123,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* lookup session */ s0 = stream_session_lookup4 (&ip0->dst_address, &ip0->src_address, udp0->dst_port, udp0->src_port, - SESSION_TYPE_IP4_UDP, my_thread_index); + SESSION_TYPE_IP4_UDP); /* no listener */ if (PREDICT_FALSE (s0 == 0)) -- cgit 1.2.3-korg