From 36ebcfffbc7ab0e83b4bb8dfaec16bf16cafb954 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 12 Sep 2019 18:36:44 -0700 Subject: tcp: use sacks for timer based recovery Type: feature If available, reuse sack scoreboard in timer triggered retransmit to minimize spurious retransmits. Additional changes/refactoring: - limited transmit updates - add sacked rxt count to scoreboard - prr pacing of fast retransmits - startup pacing updates - changed loss window to flight + mss Change-Id: I057de6a9d6401698bd1031d5cf5cfbb62f2bdf61 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_input.c | 286 +++++++++++++++++++++++------------------------ 1 file changed, 141 insertions(+), 145 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 933f913bc5a..b49b8e8fd77 100755 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -508,8 +508,6 @@ tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack) mrtt = rs->rtt_time * THZ; goto estimate_rtt; } - if (tcp_in_recovery (tc)) - return 0; goto done; } @@ -784,9 +782,17 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, return hole; } -#endif /* CLIB_MARCH_VARIANT */ -#ifndef CLIB_MARCH_VARIANT +always_inline void +scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end, + u8 has_rxt) +{ + if (!has_rxt || seq_geq (start, sb->high_rxt)) + return; + + sb->rxt_sacked += + seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start); +} always_inline void scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss) @@ -837,7 +843,7 @@ scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss) while (right) { sb->lost_bytes += scoreboard_hole_bytes (right); - sb->last_lost_bytes += right->is_lost ? 0 : right->end - right->start; + sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start); right->is_lost = 1; left = scoreboard_prev_hole (sb, right); if (!left) @@ -912,9 +918,8 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, return hole; } -#endif /* CLIB_MARCH_VARIANT */ -static void +void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una) { sack_scoreboard_hole_t *hole; @@ -928,7 +933,6 @@ scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una) sb->rescue_rxt = snd_una - 1; } -#ifndef CLIB_MARCH_VARIANT void scoreboard_init (sack_scoreboard_t * sb) { @@ -957,6 +961,23 @@ scoreboard_clear (sack_scoreboard_t * sb) sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; sb->is_reneging = 0; } + +void +scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end) +{ + sack_scoreboard_hole_t *last_hole; + + clib_warning ("sack reneging"); + + scoreboard_clear (sb); + last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + start, end); + last_hole->is_lost = 1; + sb->tail = scoreboard_hole_index (sb, last_hole); + sb->high_sacked = start; + scoreboard_init_high_rxt (sb, start); +} + #endif /* CLIB_MARCH_VARIANT */ /** @@ -983,14 +1004,18 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, *rcv_sacks; u32 blk_index = 0, i, j; + u8 has_rxt; sb->last_sacked_bytes = 0; sb->last_bytes_delivered = 0; + sb->rxt_sacked = 0; if (!tcp_opts_sack (&tc->rcv_opts) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; + has_rxt = tcp_in_cong_recovery (tc); + /* Remove invalid blocks */ blk = tc->rcv_opts.sacks; while (blk < vec_end (tc->rcv_opts.sacks)) @@ -1121,6 +1146,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->is_reneging = 0; } } + scoreboard_update_sacked_rxt (sb, hole->start, hole->end, + has_rxt); scoreboard_remove_hole (sb, hole); hole = next_hole; } @@ -1129,6 +1156,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { if (seq_gt (blk->end, hole->start)) { + scoreboard_update_sacked_rxt (sb, hole->start, blk->end, + has_rxt); hole->start = blk->end; } blk_index++; @@ -1145,11 +1174,17 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Pool might've moved */ hole = scoreboard_get_hole (sb, hole_index); hole->end = blk->start; + + scoreboard_update_sacked_rxt (sb, blk->start, blk->end, + has_rxt); + blk_index++; ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); } else if (seq_lt (blk->start, hole->end)) { + scoreboard_update_sacked_rxt (sb, blk->start, hole->end, + has_rxt); hole->end = blk->start; } hole = scoreboard_next_hole (sb, hole); @@ -1166,6 +1201,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) || sb->is_reneging || sb->holes[sb->head].start == ack); ASSERT (sb->last_lost_bytes <= sb->lost_bytes); + ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes + - sb->last_bytes_delivered >= sb->rxt_sacked); TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc); } @@ -1209,70 +1246,41 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) } } -#ifndef CLIB_MARCH_VARIANT /** * Init loss recovery/fast recovery. * * Triggered by dup acks as opposed to timer timeout. Note that cwnd is * updated in @ref tcp_cc_handle_event after fast retransmit */ -void +static void tcp_cc_init_congestion (tcp_connection_t * tc) { tcp_fastrecovery_on (tc); tc->snd_congestion = tc->snd_nxt; tc->cwnd_acc_bytes = 0; tc->snd_rxt_bytes = 0; + tc->rxt_delivered = 0; + tc->prr_delivered = 0; tc->prev_ssthresh = tc->ssthresh; tc->prev_cwnd = tc->cwnd; - tc->cc_algo->congestion (tc); - tc->fr_occurences += 1; - TCP_EVT (TCP_EVT_CC_EVT, tc, 4); -} -#endif /* CLIB_MARCH_VARIANT */ -static void -tcp_cc_recovery_exit (tcp_connection_t * tc) -{ - tc->rto_boff = 0; - tcp_update_rto (tc); - tc->snd_rxt_ts = 0; - tc->rtt_ts = 0; - tcp_recovery_off (tc); - TCP_EVT (TCP_EVT_CC_EVT, tc, 3); -} + tcp_cc_congestion (tc); -#ifndef CLIB_MARCH_VARIANT -void -tcp_cc_fastrecovery_clear (tcp_connection_t * tc) -{ - tc->snd_rxt_bytes = 0; - tc->rcv_dupacks = 0; - tc->rtt_ts = 0; + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + if (!tcp_opts_sack_permitted (&tc->rcv_opts)) + tc->cwnd += 3 * tc->snd_mss; - tcp_fastrecovery_off (tc); - tcp_fastrecovery_first_off (tc); - tc->flags &= ~TCP_CONN_FRXT_PENDING; - - TCP_EVT (TCP_EVT_CC_EVT, tc, 3); + tc->fr_occurences += 1; + TCP_EVT (TCP_EVT_CC_EVT, tc, 4); } -#endif /* CLIB_MARCH_VARIANT */ static void tcp_cc_congestion_undo (tcp_connection_t * tc) { tc->cwnd = tc->prev_cwnd; tc->ssthresh = tc->prev_ssthresh; - tc->rcv_dupacks = 0; - if (tcp_in_recovery (tc)) - { - tcp_cc_recovery_exit (tc); - tc->snd_nxt = seq_max (tc->snd_nxt, tc->snd_congestion); - } - else if (tcp_in_fastrecovery (tc)) - { - tcp_cc_fastrecovery_clear (tc); - } tcp_cc_undo_recovery (tc); ASSERT (tc->rto_boff == 0); TCP_EVT (TCP_EVT_CC_EVT, tc, 5); @@ -1288,48 +1296,74 @@ tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc) } static inline u8 -tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc) +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tcp_in_fastrecovery (tc) - && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss); + return (tcp_cc_is_spurious_timeout_rxt (tc)); } -static u8 -tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +static inline u8 +tcp_should_fastrecover_sack (tcp_connection_t * tc) +{ + return (tc->sack_sb.lost_bytes + || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + < tc->sack_sb.sacked_bytes)); +} + +static inline u8 +tcp_should_fastrecover (tcp_connection_t * tc) { - return (tcp_cc_is_spurious_timeout_rxt (tc) - || tcp_cc_is_spurious_fast_rxt (tc)); + return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + || tcp_should_fastrecover_sack (tc)); } static int tcp_cc_recover (tcp_connection_t * tc) { sack_scoreboard_hole_t *hole; + u8 is_spurious = 0; ASSERT (tcp_in_cong_recovery (tc)); - hole = scoreboard_first_hole (&tc->sack_sb); - if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt) - scoreboard_clear (&tc->sack_sb); - if (tcp_cc_is_spurious_retransmit (tc)) { tcp_cc_congestion_undo (tc); - return 1; + is_spurious = 1; } - if (tcp_in_recovery (tc)) - tcp_cc_recovery_exit (tc); - else if (tcp_in_fastrecovery (tc)) + tc->prr_delivered = 0; + tc->rxt_delivered = 0; + tc->snd_rxt_bytes = 0; + tc->snd_rxt_ts = 0; + tc->rtt_ts = 0; + tc->flags &= ~TCP_CONN_RXT_PENDING; + + /* Previous recovery left us congested. Continue sending as part + * of the current recovery event with an updated snd_congestion */ + if (tc->sack_sb.sacked_bytes) { - tcp_cc_recovered (tc); - tcp_cc_fastrecovery_clear (tc); + tc->snd_congestion = tc->snd_nxt; + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); + tcp_program_retransmit (tc); + return is_spurious; } + hole = scoreboard_first_hole (&tc->sack_sb); + if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt) + scoreboard_clear (&tc->sack_sb); + + if (!tcp_in_recovery (tc) && !is_spurious) + tcp_cc_recovered (tc); + + tcp_fastrecovery_off (tc); + tcp_fastrecovery_first_off (tc); + tcp_recovery_off (tc); + TCP_EVT (TCP_EVT_CC_EVT, tc, 3); + ASSERT (tc->rto_boff == 0); ASSERT (!tcp_in_cong_recovery (tc)); ASSERT (tcp_scoreboard_is_sane_post_recovery (tc)); - return 0; + return is_spurious; } static void @@ -1353,17 +1387,28 @@ tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs) tc->snd_congestion = tc->snd_una - 1; } -static u8 -tcp_should_fastrecover_sack (tcp_connection_t * tc) +static void +tcp_update_delivered (tcp_connection_t * tc, u8 is_dack, u8 has_sack) { - return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes; -} + if (has_sack) + { + ASSERT (tc->bytes_acked >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); -static u8 -tcp_should_fastrecover (tcp_connection_t * tc) -{ - return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD - || tcp_should_fastrecover_sack (tc)); + tc->rxt_delivered += tc->sack_sb.rxt_sacked; + tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes + - tc->sack_sb.last_bytes_delivered; + } + else + { + tcp_fastrecovery_first_on (tc); + tc->rxt_delivered = clib_max (tc->rxt_delivered + tc->bytes_acked, + tc->snd_rxt_bytes); + if (is_dack) + tc->prr_delivered += is_dack; + else + tc->prr_delivered += tc->bytes_acked - tc->snd_mss * tc->rcv_dupacks; + } } /** @@ -1380,28 +1425,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, * and accounting for segments that have left the network are done * lower. */ - if (tcp_in_fastrecovery (tc)) + if (tcp_in_cong_recovery (tc)) { if (!has_sack) - tc->rcv_dupacks++; + tc->rcv_dupacks += is_dack; if (!tc->bytes_acked) { - tcp_program_fastretransmit (tc); + tcp_update_delivered (tc, is_dack, has_sack); + tcp_program_retransmit (tc); if (!has_sack) tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); return; } } - /* - * In timer triggered recovery - */ - else if (tcp_in_recovery (tc)) - { - /* No fast recovery entry at this point */ - if (!tc->bytes_acked) - return; - } /* * Duplicate ACK. Check if we should enter fast recovery */ @@ -1410,33 +1447,24 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1); ASSERT (tc->snd_una != tc->snd_nxt || tc->sack_sb.last_sacked_bytes); + /* Heuristic to catch potential late dupacks */ + if (!tc->sack_sb.sacked_bytes && tc->snd_una == tc->snd_congestion + && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack)) + return; + tc->rcv_dupacks++; if (tcp_should_fastrecover (tc)) { - u32 pacer_wnd; - ASSERT (!tcp_in_fastrecovery (tc)); - /* Heuristic to catch potential late dupacks - * after fast retransmit exits */ - if (is_dack && tc->snd_una == tc->snd_congestion - && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack)) - { - tc->rcv_dupacks = 0; - return; - } - tcp_cc_init_congestion (tc); if (has_sack) scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una); - /* Constrain rate until we get a partial ack */ - pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss); - tcp_connection_tx_pacer_reset (tc, pacer_wnd, - 0 /* start bucket */ ); - tcp_program_fastretransmit (tc); + tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ ); + tcp_program_retransmit (tc); return; } else @@ -1475,6 +1503,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, * Legitimate ACK. 1) See if we can exit recovery */ + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0. Also needed if in congestion recovery */ + tc->rcv_dupacks = 0; + if (seq_geq (tc->snd_una, tc->snd_congestion)) { /* If spurious return, we've already updated everything */ @@ -1493,55 +1525,18 @@ tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ - /* RFC6675: If the incoming ACK is a cumulative acknowledgment, - * reset dupacks to 0. Also needed if in congestion recovery */ - tc->rcv_dupacks = 0; - - /* Post RTO timeout don't try anything fancy */ - if (tcp_in_recovery (tc)) - { - tcp_cc_rcv_ack (tc, rs); - transport_add_tx_event (&tc->connection); - return; - } - /* Remove retransmitted bytes that have been delivered */ - if (has_sack) - { - ASSERT (tc->bytes_acked >= tc->sack_sb.last_bytes_delivered - || (tc->flags & TCP_CONN_FINSNT)); + tcp_update_delivered (tc, is_dack, has_sack); - /* If we have sacks and we haven't gotten an ack beyond high_rxt, - * remove sacked bytes delivered */ - if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) - { - u32 rxt_delivered; - rxt_delivered = tc->bytes_acked - tc->sack_sb.last_bytes_delivered; - ASSERT (tc->snd_rxt_bytes >= rxt_delivered); - tc->snd_rxt_bytes -= rxt_delivered; - } - else - { - /* Apparently all retransmitted holes have been acked */ - tc->snd_rxt_bytes = 0; - tc->sack_sb.high_rxt = tc->snd_una; - } - } + if (tcp_in_recovery (tc)) + tcp_cc_rcv_ack (tc, rs); else - { - tcp_fastrecovery_first_on (tc); - if (tc->snd_rxt_bytes > tc->bytes_acked) - tc->snd_rxt_bytes -= tc->bytes_acked; - else - tc->snd_rxt_bytes = 0; - } - - tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs); + tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs); /* * Since this was a partial ack, try to retransmit some more data */ - tcp_program_fastretransmit (tc); + tcp_program_retransmit (tc); } /** @@ -2767,6 +2762,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Update rtt and rto */ tcp_estimate_initial_rtt (tc0); + tcp_connection_tx_pacer_update (tc0); /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; -- cgit 1.2.3-korg