From 3af90fceb61d0c236709c25df936bbbf304cbff5 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 3 May 2017 21:09:42 -0700 Subject: Fix TCP loss recovery, VPP-745 Allows pure loss recovery retransmits only on timeout. Change-Id: I563cdbf9e7b890a6569350bdbda4f746ace0544e Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 35 +++++++++++++++++++++++++++-------- src/vnet/tcp/tcp.h | 4 ++-- src/vnet/tcp/tcp_input.c | 35 +++++++++++++++++++++++++---------- src/vnet/tcp/tcp_newreno.c | 2 +- src/vnet/tcp/tcp_output.c | 44 +++++++++++++++++--------------------------- 5 files changed, 72 insertions(+), 48 deletions(-) diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index de4edfa6..e80e2ec9 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -567,30 +567,49 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +always_inline u32 +tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) +{ + if (tc->snd_wnd < tc->snd_mss) + { + return tc->snd_wnd < snd_space ? tc->snd_wnd : 0; + } + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); +} + /** * Compute tx window session is allowed to fill. */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { - u32 snd_space, chunk; + int snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; /* If we haven't gotten dupacks or if we did and have gotten sacked bytes * then we can still send */ - if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0 && (tc->rcv_dupacks == 0 || tc->sack_sb.last_sacked_bytes))) { - chunk = tc->snd_wnd > tc->snd_mss ? tc->snd_mss : tc->snd_wnd; snd_space = tcp_available_snd_space (tc); + return tcp_round_snd_space (tc, snd_space); + } - /* If we can't write at least a segment, don't try at all */ - if (chunk == 0 || snd_space < chunk) + if (tcp_in_recovery (tc)) + { + tc->snd_nxt = tc->snd_una_max; + snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + - (tc->snd_una_max - tc->snd_congestion); + if (snd_space <= 0) return 0; - - /* round down to mss multiple */ - return snd_space - (snd_space % chunk); + return tcp_round_snd_space (tc, snd_space); } /* If in fast recovery, send 1 SMSS if wnd allows */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index f61a1b52..c75479dc 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -24,8 +24,8 @@ #include #include -#define TCP_TICK 10e-3 /**< TCP tick period (s) */ -#define THZ 1/TCP_TICK /**< TCP tick frequency */ +#define TCP_TICK 0.001 /**< TCP tick period (s) */ +#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ #define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ #define TCP_MAX_OPTION_SPACE 40 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0030cfe2..e9c52c5e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -392,11 +392,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ - if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; } - /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't @@ -406,19 +405,22 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) mrtt = tcp_time_now () - tc->opt.tsecr; } + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; + + /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is + * not valid */ + if (tc->bytes_acked) + tc->rto_boff = 0; + /* Ignore dubious measurements */ if (mrtt == 0 || mrtt > TCP_RTT_MAX) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - /* Allow measuring of RTT and make sure boff is 0 */ - tc->rtt_seq = 0; - tc->rto_boff = 0; - - return 1; + return 0; } /** @@ -735,7 +737,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_cong_recovery (tc)) + if (tcp_in_fastrecovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -749,6 +751,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* Clear retransmitted bytes. XXX should we clear all? */ tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); /* In case snd_nxt is still in the past and output tries to @@ -772,6 +775,13 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; tc->rcv_dupacks = 0; + if (tcp_in_recovery (tc)) + { + tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes); + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + if (seq_geq (tc->snd_una, tc->snd_congestion)) + tcp_recovery_off (tc); + } } } @@ -897,7 +907,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_cc_rcv_ack (tc, b); /* If everything has been acked, stop retransmit timer - * otherwise update */ + * otherwise update. */ if (tc->snd_una == tc->snd_una_max) tcp_retransmit_timer_reset (tc); else @@ -1778,6 +1788,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_send_reset (b0, is_ip4); goto drop; } + + /* Update rtt and rto */ + tc0->bytes_acked = 1; + tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 856dffe4..3525f4e5 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -38,7 +38,7 @@ newreno_rcv_ack (tcp_connection_t * tc) else { /* Round up to 1 if needed */ - tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1); } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a85d30da..7ee930c6 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -73,7 +73,7 @@ tcp_set_snd_mss (tcp_connection_t * tc) snd_mss = dummy_mtu; /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; + snd_mss = clib_min (tc->opt.mss, snd_mss); tc->snd_mss = snd_mss; @@ -923,6 +923,12 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, /* TODO this is updated in output as well ... */ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -1019,9 +1025,10 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) } /* Start again from the beginning */ - tcp_recovery_on (tc); + tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } static void @@ -1032,7 +1039,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, snd_space, n_bytes; + u32 bi, n_bytes; if (is_syn) { @@ -1065,33 +1072,16 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (snd_space == 0) - { - clib_warning ("no wnd to retransmit"); - tcp_return_buffer (tm); - - /* Force one segment */ - tcp_retransmit_first_unacked (tc); + /* Send one segment. No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + scoreboard_clear (&tc->sack_sb); - /* Re-enable retransmit timer. Output may be unwilling - * to do it for us */ - tcp_retransmit_timer_set (tc); - - return; - } - else + if (n_bytes == 0) { - /* No fancy recovery for now! */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); - scoreboard_clear (&tc->sack_sb); - - if (n_bytes == 0) - return; + clib_warning ("could not retransmit"); + return; } } else @@ -1400,7 +1390,7 @@ tcp46_output_inline (vlib_main_t * vm, } /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { -- cgit 1.2.3-korg