aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp/tcp_input.c
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2018-10-19 16:26:24 -0700
committerDamjan Marion <dmarion@me.com>2018-10-23 19:35:51 +0000
commitbf4d5ce58435d3f424749ff69650ea67ce778f04 (patch)
tree35470ff4780c23a60ff0bf8c2680c73be7f6978f /src/vnet/tcp/tcp_input.c
parentf87eb9b3722f38112de6bc823527438e2a521bc0 (diff)
tcp: fast retransmit improvements
Patch is too large to be ported to 18.10 just days before release. - handle fast retransmits outside of established node and limit the retransmit burst size to avoid tx losses and worsening congestion. - in the absance of a tx pacer, use slow start after fast retransmit exists - add fast retransmit heuristic that re-retries sending the first segment if everything else fails - fine tuning Change-Id: I84a2ab8fbba8b97f1d2b26584dc11a1e2c33c8d2 Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp/tcp_input.c')
-rw-r--r--src/vnet/tcp/tcp_input.c223
1 files changed, 165 insertions, 58 deletions
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 4e3987eeaed..39a538ba681 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -749,7 +749,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
{
- *snd_limited = 1;
+ *snd_limited = 0;
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
/* Rule (4): if hole beyond high_sacked */
@@ -993,10 +993,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
- ASSERT (sb->sacked_bytes == 0
+ ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
|| sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
- - seq_max (tc->snd_una, ack));
+ - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->holes[sb->head].start == ack + sb->snd_una_adv);
TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
@@ -1052,6 +1052,9 @@ tcp_cc_init_congestion (tcp_connection_t * tc)
tcp_fastrecovery_on (tc);
tc->snd_congestion = tc->snd_una_max;
tc->cwnd_acc_bytes = 0;
+ tc->snd_rxt_bytes = 0;
+ tc->prev_ssthresh = tc->ssthresh;
+ tc->prev_cwnd = tc->cwnd;
tc->cc_algo->congestion (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
}
@@ -1074,8 +1077,14 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
tc->snd_rxt_bytes = 0;
tc->rcv_dupacks = 0;
tc->snd_nxt = tc->snd_una_max;
+ tc->snd_rxt_bytes = 0;
+
+ /* HACK: since we don't have an output pacer, force slow start */
+ tc->cwnd = 20 * tc->snd_mss;
+
tcp_fastrecovery_off (tc);
tcp_fastrecovery_1_smss_off (tc);
+ tcp_fastrecovery_first_off (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
@@ -1088,13 +1097,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc)
tc->rcv_dupacks = 0;
if (tcp_in_recovery (tc))
tcp_cc_recovery_exit (tc);
+ else if (tcp_in_fastrecovery (tc))
+ tcp_cc_fastrecovery_exit (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
- /* TODO extend for fastrecovery */
}
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+static inline u8
+tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
{
return (tcp_in_recovery (tc) && tc->rto_boff == 1
&& tc->snd_rxt_ts
@@ -1102,6 +1112,20 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
&& timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
}
+static inline u8
+tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+{
+ return (tcp_in_fastrecovery (tc)
+ && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+ return (tcp_cc_is_spurious_timeout_rxt (tc)
+ || tcp_cc_is_spurious_fast_rxt (tc));
+}
+
static int
tcp_cc_recover (tcp_connection_t * tc)
{
@@ -1158,6 +1182,84 @@ tcp_should_fastrecover (tcp_connection_t * tc)
|| tcp_should_fastrecover_sack (tc));
}
+void
+tcp_program_fastretransmit (tcp_connection_t * tc)
+{
+ tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
+ if (!(tc->flags & TCP_CONN_FRXT_PENDING))
+ {
+ vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
+ tc->flags |= TCP_CONN_FRXT_PENDING;
+ }
+}
+
+void
+tcp_do_fastretransmits (u32 thread_index)
+{
+ tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+ u32 max_burst_size, burst_size, n_segs = 0;
+ tcp_connection_t *tc;
+ int i;
+
+ if (vec_len (wrk->pending_fast_rxt) == 0)
+ return;
+
+ vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
+ vec_reset_length (wrk->pending_fast_rxt);
+
+ max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
+ max_burst_size = clib_max (max_burst_size, 1);
+
+ for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+ {
+ tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+ tc->flags &= ~TCP_CONN_FRXT_PENDING;
+
+ if (!tcp_in_fastrecovery (tc))
+ continue;
+
+ /* TODO tx pacer instead of this */
+ if (n_segs >= VLIB_FRAME_SIZE)
+ {
+ tcp_program_fastretransmit (tc);
+ continue;
+ }
+
+ burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+
+ if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
+ {
+ /* The first segment MUST be retransmitted */
+ if (tcp_retransmit_first_unacked (tc))
+ {
+ tcp_program_fastretransmit (tc);
+ continue;
+ }
+
+ /* Post retransmit update cwnd to ssthresh and account for the
+ * three segments that have left the network and should've been
+ * buffered at the receiver XXX */
+ tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+
+ /* If cwnd allows, send more data */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ scoreboard_init_high_rxt (&tc->sack_sb,
+ tc->snd_una + tc->snd_mss);
+ tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+ n_segs += tcp_fast_retransmit_sack (tc, burst_size);
+ }
+ else
+ {
+ n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
+ }
+ }
+ else
+ n_segs += tcp_fast_retransmit (tc, burst_size);
+ }
+ vec_reset_length (wrk->ongoing_fast_rxt);
+}
+
/**
* One function to rule them all ... and in the darkness bind them
*/
@@ -1170,7 +1272,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tc);
return;
}
/*
@@ -1196,20 +1298,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
ASSERT (!tcp_in_fastrecovery (tc));
- /* If of of the two conditions lower hold, reset dupacks because
- * we're probably after timeout (RFC6582 heuristics).
- * If Cumulative ack does not cover more than congestion threshold,
- * and:
- * 1) The following doesn't hold: The congestion window is greater
- * than SMSS bytes and the difference between highest_ack
- * and prev_highest_ack is at most 4*SMSS bytes
- * 2) Echoed timestamp in the last non-dup ack does not equal the
- * stored timestamp
- */
- if (seq_leq (tc->snd_una, tc->snd_congestion)
- && ((!(tc->cwnd > tc->snd_mss
- && tc->bytes_acked <= 4 * tc->snd_mss))
- || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ /* Heuristic to catch potential late dupacks
+ * after fast retransmit exits */
+ if (is_dack && tc->snd_una == tc->snd_congestion
+ && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
{
tc->rcv_dupacks = 0;
return;
@@ -1218,26 +1310,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
tcp_cc_init_congestion (tc);
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
- /* The first segment MUST be retransmitted */
- tcp_retransmit_first_unacked (tc);
-
- /* Post retransmit update cwnd to ssthresh and account for the
- * three segments that have left the network and should've been
- * buffered at the receiver XXX */
- tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
- ASSERT (tc->cwnd >= tc->snd_mss);
-
- /* If cwnd allows, send more data */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
- {
- scoreboard_init_high_rxt (&tc->sack_sb,
- tc->snd_una + tc->snd_mss);
- tcp_fast_retransmit_sack (tc);
- }
- else
- {
- tcp_fast_retransmit_no_sack (tc);
- }
+ tc->sack_sb.high_rxt = tc->snd_una;
+
+ tcp_program_fastretransmit (tc);
return;
}
else if (!tc->bytes_acked
@@ -1249,6 +1325,28 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
else
goto partial_ack;
}
+ /* Don't allow entry in fast recovery if still in recovery, for now */
+ else if (0 && is_dack && tcp_in_recovery (tc))
+ {
+ /* If of of the two conditions lower hold, reset dupacks because
+ * we're probably after timeout (RFC6582 heuristics).
+ * If Cumulative ack does not cover more than congestion threshold,
+ * and:
+ * 1) The following doesn't hold: The congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes
+ * 2) Echoed timestamp in the last non-dup ack does not equal the
+ * stored timestamp
+ */
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ {
+ tc->rcv_dupacks = 0;
+ return;
+ }
+ }
if (!tc->bytes_acked)
return;
@@ -1259,14 +1357,11 @@ partial_ack:
/*
* Legitimate ACK. 1) See if we can exit recovery
*/
- /* XXX limit this only to first partial ack? */
- if (seq_lt (tc->snd_una, tc->snd_congestion))
- tcp_retransmit_timer_force_update (tc);
- else
- tcp_retransmit_timer_update (tc);
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
+ tcp_retransmit_timer_update (tc);
+
/* If spurious return, we've already updated everything */
if (tcp_cc_recover (tc))
{
@@ -1286,6 +1381,9 @@ partial_ack:
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
+ /* XXX limit this only to first partial ack? */
+ tcp_retransmit_timer_force_update (tc);
+
/* RFC6675: If the incoming ACK is a cumulative acknowledgment,
* reset dupacks to 0. Also needed if in congestion recovery */
tc->rcv_dupacks = 0;
@@ -1300,24 +1398,33 @@ partial_ack:
}
/* Remove retransmitted bytes that have been delivered */
- ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
- >= tc->sack_sb.last_bytes_delivered
- || (tc->flags & TCP_CONN_FINSNT));
-
- if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
+ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+ >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
+
/* If we have sacks and we haven't gotten an ack beyond high_rxt,
* remove sacked bytes delivered */
- rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
- - tc->sack_sb.last_bytes_delivered;
- ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
- tc->snd_rxt_bytes -= rxt_delivered;
+ if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ {
+ rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+ - tc->sack_sb.last_bytes_delivered;
+ ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+ tc->snd_rxt_bytes -= rxt_delivered;
+ }
+ else
+ {
+ /* Apparently all retransmitted holes have been acked */
+ tc->snd_rxt_bytes = 0;
+ }
}
else
{
- /* Either all retransmitted holes have been acked, or we're
- * "in the blind" and retransmitting segment by segment */
- tc->snd_rxt_bytes = 0;
+ if (tc->snd_rxt_bytes > tc->bytes_acked)
+ tc->snd_rxt_bytes -= tc->bytes_acked;
+ else
+ tc->snd_rxt_bytes = 0;
}
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
@@ -1325,7 +1432,7 @@ partial_ack:
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tc);
}
/**