diff options
author | Florin Coras <fcoras@cisco.com> | 2019-09-27 08:16:40 -0700 |
---|---|---|
committer | Dave Barach <openvpp@barachs.net> | 2019-10-10 20:32:43 +0000 |
commit | be237bf02382854118986e8ea84c7544e42023f2 (patch) | |
tree | 85ef98b86f1a8949a54602c15b5510c7cccfda20 /src/vnet/tcp | |
parent | 5b1379be3e25df096d97dcd217965169fc6bb1b2 (diff) |
tcp: retry lost retransmits
Add a heuristic that detects lost retransmitted segments and retries
sending them.
Type: feature
Change-Id: I34d1bb16799e1993779222eb2bfad4b40704159e
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- | src/vnet/tcp/tcp.c | 24 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.h | 9 | ||||
-rwxr-xr-x | src/vnet/tcp/tcp_input.c | 250 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_output.c | 44 |
4 files changed, 179 insertions, 148 deletions
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 5ffb1e27ab8..6ef03dc093d 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -915,17 +915,26 @@ static u8 * format_tcp_congestion (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - u32 indent = format_get_indent (s); + u32 indent = format_get_indent (s), prr_space = 0; s = format (s, "%U ", format_tcp_congestion_status, tc); s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n", tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked); - s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u rxt_bytes %u\n", + s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u\n", format_white_space, indent, tcp_available_cc_snd_space (tc), - tc->prev_cwnd, tc->prev_ssthresh, tc->snd_rxt_bytes); - s = format (s, "%Usnd_congestion %u dupack %u limited_transmit %u\n", + tc->prev_cwnd, tc->prev_ssthresh); + s = format (s, "%Usnd_cong %u dupack %u limited_tx %u\n", format_white_space, indent, tc->snd_congestion - tc->iss, tc->rcv_dupacks, tc->limited_transmit - tc->iss); + s = format (s, "%Urxt_bytes %u rxt_delivered %u rxt_head %u rxt_ts %u\n", + format_white_space, indent, tc->snd_rxt_bytes, + tc->rxt_delivered, tc->rxt_head - tc->iss, + tcp_time_now_w_thread (tc->c_thread_index) - tc->snd_rxt_ts); + if (tcp_in_fastrecovery (tc)) + prr_space = tcp_fastrecovery_prr_snd_space (tc); + s = format (s, "%Uprr_start %u prr_delivered %u prr space %u\n", + format_white_space, indent, tc->prr_start - tc->iss, + tc->prr_delivered, prr_space); return s; } @@ -1140,10 +1149,11 @@ format_tcp_scoreboard (u8 * s, va_list * args) sack_scoreboard_hole_t *hole; u32 indent = format_get_indent (s); - s = format (s, "sacked %u last_sacked %u lost %u last_lost %u\n", + s = format (s, "sacked %u last_sacked %u lost %u last_lost %u" + " rxt_sacked %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes, - sb->last_lost_bytes); - s = format (s, "%Ulast_bytes_delivered %u 
high_sacked %u is_reneging %u\n", + sb->last_lost_bytes, sb->rxt_sacked); + s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n", format_white_space, indent, sb->last_bytes_delivered, sb->high_sacked - tc->iss, sb->is_reneging); s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u", diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 1bf32818171..a31c46cdacf 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -168,7 +168,7 @@ typedef struct _sack_scoreboard u32 sacked_bytes; /**< Number of bytes sacked in sb */ u32 last_sacked_bytes; /**< Number of bytes last sacked */ u32 last_bytes_delivered; /**< Sack bytes delivered to app */ - u32 rxt_sacked; /**< Rxt last delivered */ + u32 rxt_sacked; /**< Rxt bytes last delivered */ u32 high_sacked; /**< Highest byte sacked (fack) */ u32 high_rxt; /**< Highest retransmitted sequence */ u32 rescue_rxt; /**< Rescue sequence number */ @@ -226,7 +226,7 @@ sack_scoreboard_hole_t *scoreboard_last_hole (sack_scoreboard_t * sb); void scoreboard_clear (sack_scoreboard_t * sb); void scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end); void scoreboard_init (sack_scoreboard_t * sb); -void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una); +void scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una); u8 *format_tcp_scoreboard (u8 * s, va_list * args); #define TCP_BTS_INVALID_INDEX ((u32)~0) @@ -369,7 +369,9 @@ typedef struct _tcp_connection u32 snd_rxt_bytes; /**< Retransmitted bytes during current cc event */ u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ u32 prr_delivered; /**< RFC6937 bytes delivered during current event */ + u32 prr_start; /**< snd_una when prr starts */ u32 rxt_delivered; /**< Rxt bytes delivered during current cc event */ + u32 rxt_head; /**< snd_una last time we re rxted the head */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ u32 snd_congestion; /**< snd_una_max when congestion is detected */ 
u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */ @@ -957,8 +959,7 @@ tcp_is_lost_fin (tcp_connection_t * tc) } u32 tcp_snd_space (tcp_connection_t * tc); -//void tcp_cc_init_congestion (tcp_connection_t * tc); -//void tcp_cc_fastrecovery_clear (tcp_connection_t * tc); +int tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc); fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index b49b8e8fd77..f177f5f4753 100755 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -629,34 +629,6 @@ tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) tc->burst_acked += tc->bytes_acked; } -/** - * Check if duplicate ack as per RFC5681 Sec. 2 - */ -static u8 -tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, - u32 prev_snd_una) -{ - return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una) - && seq_gt (tc->snd_nxt, tc->snd_una) - && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) - && (prev_snd_wnd == tc->snd_wnd)); -} - -/** - * Checks if ack is a congestion control event. - */ -static u8 -tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, - u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack) -{ - /* Check if ack is duplicate. 
Per RFC 6675, ACKs that SACK new data are - * defined to be 'duplicate' */ - *is_dack = tc->sack_sb.last_sacked_bytes - || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); - - return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); -} - #ifndef CLIB_MARCH_VARIANT static u32 scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) @@ -920,7 +892,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } void -scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una) +scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (sb); @@ -975,7 +947,7 @@ scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end) last_hole->is_lost = 1; sb->tail = scoreboard_hole_index (sb, last_hole); sb->high_sacked = start; - scoreboard_init_high_rxt (sb, start); + scoreboard_init_rxt (sb, start); } #endif /* CLIB_MARCH_VARIANT */ @@ -1203,6 +1175,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) ASSERT (sb->last_lost_bytes <= sb->lost_bytes); ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes - sb->last_bytes_delivered >= sb->rxt_sacked); + ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc); } @@ -1261,9 +1235,11 @@ tcp_cc_init_congestion (tcp_connection_t * tc) tc->snd_rxt_bytes = 0; tc->rxt_delivered = 0; tc->prr_delivered = 0; + tc->prr_start = tc->snd_una; tc->prev_ssthresh = tc->ssthresh; tc->prev_cwnd = tc->cwnd; + tc->snd_rxt_ts = tcp_tstamp (tc); tcp_cc_congestion (tc); /* Post retransmit update cwnd to ssthresh and account for the @@ -1310,8 +1286,29 @@ tcp_should_fastrecover_sack (tcp_connection_t * tc) } static inline u8 -tcp_should_fastrecover (tcp_connection_t * tc) +tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack) { + if (!has_sack) + { + /* If of of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 
heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) + { + tc->rcv_dupacks = 0; + return 0; + } + } return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) || tcp_should_fastrecover_sack (tc)); } @@ -1330,6 +1327,7 @@ tcp_cc_recover (tcp_connection_t * tc) is_spurious = 1; } + tc->rcv_dupacks = 0; tc->prr_delivered = 0; tc->rxt_delivered = 0; tc->snd_rxt_bytes = 0; @@ -1342,8 +1340,9 @@ tcp_cc_recover (tcp_connection_t * tc) if (tc->sack_sb.sacked_bytes) { tc->snd_congestion = tc->snd_nxt; - if (tcp_opts_sack_permitted (&tc->rcv_opts)) - scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); + tc->snd_rxt_ts = tcp_tstamp (tc); + tc->prr_start = tc->snd_una; + scoreboard_init_rxt (&tc->sack_sb, tc->snd_una); tcp_program_retransmit (tc); return is_spurious; } @@ -1355,6 +1354,8 @@ tcp_cc_recover (tcp_connection_t * tc) if (!tcp_in_recovery (tc) && !is_spurious) tcp_cc_recovered (tc); + tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ ); + tcp_fastrecovery_off (tc); tcp_fastrecovery_first_off (tc); tcp_recovery_off (tc); @@ -1387,30 +1388,6 @@ tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs) tc->snd_congestion = tc->snd_una - 1; } -static void -tcp_update_delivered (tcp_connection_t * tc, u8 is_dack, u8 has_sack) -{ - if (has_sack) - { - ASSERT (tc->bytes_acked >= tc->sack_sb.last_bytes_delivered - || (tc->flags & TCP_CONN_FINSNT)); - - tc->rxt_delivered += tc->sack_sb.rxt_sacked; - tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes - - 
tc->sack_sb.last_bytes_delivered; - } - else - { - tcp_fastrecovery_first_on (tc); - tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked, - tc->snd_rxt_bytes); - if (is_dack) - tc->prr_delivered += is_dack; - else - tc->prr_delivered += tc->bytes_acked - tc->snd_mss * tc->rcv_dupacks; - } -} - /** * One function to rule them all ... and in the darkness bind them */ @@ -1421,92 +1398,34 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts); /* - * Already in fast recovery. Return if no data acked, partial acks - * and accounting for segments that have left the network are done - * lower. + * If not in recovery, figure out if we should enter */ - if (tcp_in_cong_recovery (tc)) + if (!tcp_in_cong_recovery (tc)) { - if (!has_sack) - tc->rcv_dupacks += is_dack; - - if (!tc->bytes_acked) - { - tcp_update_delivered (tc, is_dack, has_sack); - tcp_program_retransmit (tc); - if (!has_sack) - tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); - return; - } - } - /* - * Duplicate ACK. 
Check if we should enter fast recovery - */ - else if (is_dack) - { - TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1); - ASSERT (tc->snd_una != tc->snd_nxt || tc->sack_sb.last_sacked_bytes); - - /* Heuristic to catch potential late dupacks */ - if (!tc->sack_sb.sacked_bytes && tc->snd_una == tc->snd_congestion - && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack)) - return; + ASSERT (is_dack); tc->rcv_dupacks++; + TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1); + tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); - if (tcp_should_fastrecover (tc)) + if (tcp_should_fastrecover (tc, has_sack)) { - ASSERT (!tcp_in_fastrecovery (tc)); - tcp_cc_init_congestion (tc); if (has_sack) - scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); + scoreboard_init_rxt (&tc->sack_sb, tc->snd_una); tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ ); tcp_program_retransmit (tc); - return; - } - else - { - tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); - return; - } - } - /* Don't allow entry in fast recovery if still in recovery, for now */ - else if (0 && is_dack && tcp_in_recovery (tc)) - { - /* If of of the two conditions lower hold, reset dupacks because - * we're probably after timeout (RFC6582 heuristics). - * If Cumulative ack does not cover more than congestion threshold, - * and: - * 1) The following doesn't hold: The congestion window is greater - * than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes - * 2) Echoed timestamp in the last non-dup ack does not equal the - * stored timestamp - */ - if (seq_leq (tc->snd_una, tc->snd_congestion) - && ((!(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) - { - tc->rcv_dupacks = 0; - return; } - } - ASSERT (tc->bytes_acked); - TCP_EVT (TCP_EVT_CC_PACK, tc); + return; + } /* - * Legitimate ACK. 1) See if we can exit recovery + * Already in recovery. 
See if we can exit and stop retransmitting */ - /* RFC6675: If the incoming ACK is a cumulative acknowledgment, - * reset dupacks to 0. Also needed if in congestion recovery */ - tc->rcv_dupacks = 0; - if (seq_geq (tc->snd_una, tc->snd_congestion)) { /* If spurious return, we've already updated everything */ @@ -1522,21 +1441,88 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, } /* - * Legitimate ACK. 2) If PARTIAL ACK try to retransmit + * Process (re)transmit feedback. Output path uses this to decide how much + * more data to release into the network */ + if (has_sack) + { + tc->rxt_delivered += tc->sack_sb.rxt_sacked; + tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes + - tc->sack_sb.last_bytes_delivered; + + tcp_program_retransmit (tc); + } + else + { + if (is_dack) + { + tc->rcv_dupacks += 1; + TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1); + } + tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked, + tc->snd_rxt_bytes); + if (is_dack) + tc->prr_delivered += 1; + else + tc->prr_delivered += tc->bytes_acked - tc->snd_mss * tc->rcv_dupacks; - /* Remove retransmitted bytes that have been delivered */ - tcp_update_delivered (tc, is_dack, has_sack); + /* If partial ack, assume that the first un-acked segment was lost */ + if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + tcp_fastrecovery_first_on (tc); + + tcp_program_retransmit (tc); + } + + /* + * Notify cc of the event + */ + + if (!tc->bytes_acked) + { + tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); + return; + } + + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0. Also needed if in congestion recovery */ + tc->rcv_dupacks = 0; if (tcp_in_recovery (tc)) tcp_cc_rcv_ack (tc, rs); else tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs); +} - /* - * Since this was a partial ack, try to retransmit some more data - */ - tcp_program_retransmit (tc); +/** + * Check if duplicate ack as per RFC5681 Sec. 
2 + */ +always_inline u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, + u32 prev_snd_una) +{ + return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una) + && seq_gt (tc->snd_nxt, tc->snd_una) + && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) + && (prev_snd_wnd == tc->snd_wnd)); +} + +/** + * Checks if ack is a congestion control event. + */ +static u8 +tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, + u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack) +{ + /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are + * defined to be 'duplicate' as well */ + *is_dack = tc->sack_sb.last_sacked_bytes + || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); + + /* If reneging, wait for timer based retransmits */ + if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging)) + return 0; + + return (*is_dack || tcp_in_cong_recovery (tc)); } /** diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a7b0e398d36..79866aff03a 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1599,7 +1599,7 @@ tcp_timer_retransmit_handler (u32 tc_index) } if (tcp_opts_sack_permitted (&tc->rcv_opts)) - scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una + tc->snd_mss); + scoreboard_init_rxt (&tc->sack_sb, tc->snd_una + n_bytes); tcp_program_retransmit (tc); } @@ -1858,7 +1858,7 @@ done: /** * Estimate send space using proportional rate reduction (RFC6937) */ -static int +int tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc) { u32 pipe, prr_out; @@ -1874,13 +1874,24 @@ tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc) } else { - int limit = tc->prr_delivered - prr_out + tc->snd_mss; + int limit; + limit = clib_max ((int) (tc->prr_delivered - prr_out), 0) + tc->snd_mss; space = clib_min (tc->ssthresh - pipe, limit); } space = clib_max (space, prr_out ? 
0 : tc->snd_mss); return space; } +static inline u8 +tcp_retransmit_should_retry_head (tcp_connection_t * tc, + sack_scoreboard_t * sb) +{ + u32 tx_adv_sack = sb->high_sacked - tc->snd_congestion; + f64 rr = (f64) tc->ssthresh / tc->prev_cwnd; + + return (tx_adv_sack > (tc->snd_una - tc->prr_start) * rr); +} + #define scoreboard_rescue_rxt_valid(_sb, _tc) \ (seq_geq (_sb->rescue_rxt, _tc->snd_una) \ && seq_leq (_sb->rescue_rxt, _tc->snd_congestion)) @@ -1917,8 +1928,31 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, return 0; } - TCP_EVT (TCP_EVT_CC_EVT, tc, 0); sb = &tc->sack_sb; + + /* Check if snd_una is a lost retransmit */ + if (seq_gt (sb->high_sacked, tc->snd_congestion) + && tc->rxt_head != tc->snd_una + && tcp_retransmit_should_retry_head (tc, sb)) + { + n_written = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, + &b); + if (!n_written) + { + tcp_program_retransmit (tc); + goto done; + } + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); + n_segs = 1; + + tc->rxt_head = tc->snd_una; + tc->rxt_delivered += n_written; + tc->prr_delivered += n_written; + ASSERT (tc->rxt_delivered <= tc->snd_rxt_bytes); + } + + TCP_EVT (TCP_EVT_CC_EVT, tc, 0); hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); max_deq = transport_max_tx_dequeue (&tc->connection); @@ -1931,7 +1965,7 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, if (!hole) { /* We are out of lost holes to retransmit so send some new data. */ - if (max_deq) + if (max_deq > tc->snd_mss) { u32 n_segs_new, av_window; av_window = tc->snd_wnd - (tc->snd_nxt - tc->snd_una); |