From bf4d5ce58435d3f424749ff69650ea67ce778f04 Mon Sep 17 00:00:00 2001
From: Florin Coras
Date: Fri, 19 Oct 2018 16:26:24 -0700
Subject: tcp: fast retransmit improvements

Patch is too large to be ported to 18.10 just days before release.

- handle fast retransmits outside of the established node and limit the
  retransmit burst size to avoid tx losses and worsening congestion.
- in the absence of a tx pacer, use slow start after fast retransmit exits
- add fast retransmit heuristic that retries resending the first segment
  if everything else fails
- fine-tuning

Change-Id: I84a2ab8fbba8b97f1d2b26584dc11a1e2c33c8d2
Signed-off-by: Florin Coras
---
 src/vnet/tcp/tcp.c        |  16 +---
 src/vnet/tcp/tcp.h        |  22 ++++-
 src/vnet/tcp/tcp_debug.h  |  47 +++++++---
 src/vnet/tcp/tcp_input.c  | 223 ++++++++++++++++++++++++++++++++++------------
 src/vnet/tcp/tcp_output.c |  78 +++++++++++-----
 5 files changed, 279 insertions(+), 107 deletions(-)

(limited to 'src')

diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index e32b5c417ae..cb05b8c0533 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -950,7 +950,8 @@ format_tcp_scoreboard (u8 * s, va_list * args) hole = scoreboard_first_hole (sb); if (hole) - s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail); + s = format (s, "\n head %u tail %u %u holes:\n", sb->head, sb->tail, + pool_elts (sb->holes)); while (hole) { @@ -1027,7 +1028,7 @@ tcp_snd_space_inline (tcp_connection_t * tc) { int snd_space, snt_limited; - if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) + if (PREDICT_TRUE (!tcp_in_fastrecovery (tc))) { snd_space = tcp_available_output_snd_space (tc); @@ -1047,16 +1048,6 @@ tcp_snd_space_inline (tcp_connection_t * tc) return tcp_round_snd_space (tc, snd_space); } - if (tcp_in_recovery (tc)) - { - tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes - - (tc->snd_una_max - tc->snd_congestion); - if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) - return 0; - return tcp_round_snd_space (tc, snd_space); - } - /* RFC 5681: When previously unsent data is available and the new value of * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS * bytes of previously unsent data. */ @@ -1103,6 +1094,7 @@ tcp_update_time (f64 now, u8 thread_index) tw_timer_expire_timers_16t_2w_512sl (&tcp_main.
wrk_ctx[thread_index].timer_wheel, now); + tcp_do_fastretransmits (thread_index); tcp_flush_frames_to_output (thread_index); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 165659b6d9f..a036072e546 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -120,6 +120,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(FR_1_SMSS, "Sent 1 SMSS") \ _(HALF_OPEN_DONE, "Half-open completed") \ _(FINPNDG, "FIN pending") \ + _(FRXT_PENDING, "Fast-retransmit pending") \ + _(FRXT_FIRST, "Fast-retransmit first again") \ typedef enum _tcp_connection_flag_bits { @@ -345,6 +347,9 @@ struct _tcp_cc_algorithm #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST) +#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST) +#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST) #define tcp_in_cong_recovery(tc) ((tc)->flags & \ (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) @@ -354,6 +359,7 @@ tcp_cong_recovery_off (tcp_connection_t * tc) { tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); tcp_fastrecovery_1_smss_off (tc); + tcp_fastrecovery_first_off (tc); } typedef enum _tcp_error @@ -379,9 +385,15 @@ typedef struct tcp_worker_ctx_ output nodes */ vlib_frame_t *ip_lookup_tx_frames[2]; /**< tx frames for ip 4/6 lookup nodes */ + u32 *pending_fast_rxt; /**< vector of connections + needing fast rxt */ + u32 *ongoing_fast_rxt; /**< vector of connections + now doing fast rxt */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); u8 cached_opts[40]; /**< cached 'on the wire' options for bursts */ + } tcp_worker_ctx_t; typedef struct _tcp_main @@ -542,6 +554,8 @@ void tcp_update_burst_snd_vars (tcp_connection_t * tc); void tcp_update_rto (tcp_connection_t * tc); void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4); void tcp_flush_frames_to_output (u8 thread_index); +void tcp_program_fastretransmit (tcp_connection_t * tc); +void tcp_do_fastretransmits (u32 thread_index); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -659,10 +673,10 @@ tcp_is_lost_fin (tcp_connection_t * tc) } u32 tcp_snd_space (tcp_connection_t * tc); -void tcp_retransmit_first_unacked (tcp_connection_t * tc); -void tcp_fast_retransmit_no_sack (tcp_connection_t * tc); -void tcp_fast_retransmit_sack (tcp_connection_t * tc); -void tcp_fast_retransmit (tcp_connection_t * tc); +int tcp_retransmit_first_unacked (tcp_connection_t * tc); +int tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size); +int tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size); +int tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size); void tcp_cc_init_congestion (tcp_connection_t * tc); void tcp_cc_fastrecovery_exit (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ccf12dae1bc..8f626b1afeb 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -629,6 +629,8 @@ if (_av > 0) \ #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ { \ + if (_tc->snd_una != _tc->iss) \ + TCP_EVT_CC_STAT_PRINT (_tc); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "cc: %s snd_space %u snd_una %u out %u flight %u", \ @@ -788,9 +790,11 @@ if (TCP_DEBUG_CC > 1) \ #define STATS_INTERVAL 1 -#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) 
\ -{ \ -if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ +#define tcp_cc_time_to_print_stats(_tc) \ + _tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now() \ + || tcp_in_fastrecovery (_tc) \ + +#define TCP_EVT_CC_RTO_STAT_PRINT(_tc) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ @@ -801,29 +805,40 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ ed->data[0] = _tc->rto; \ ed->data[1] = _tc->srtt; \ ed->data[2] = _tc->rttvar; \ -} \ } -#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...) \ + +#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \ +{ \ +if (tcp_cc_time_to_print_stats (_tc)) \ { \ -if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ + TCP_EVT_CC_RTO_STAT_PRINT (_tc); \ +} \ +} + +#define TCP_EVT_CC_SND_STAT_PRINT(_tc) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "snd_stat: dack %u sacked %u lost %u out %u rxt %u", \ + .format = "snd_stat: cc_space %u sacked %u lost %u out %u rxt %u", \ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_dupacks; \ + ed->data[0] = tcp_available_cc_snd_space (_tc); \ ed->data[1] = _tc->sack_sb.sacked_bytes; \ ed->data[2] = _tc->sack_sb.lost_bytes; \ ed->data[3] = tcp_bytes_out (_tc); \ ed->data[3] = _tc->snd_rxt_bytes; \ -} \ } -#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \ +#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...) \ +{ \ +if (tcp_cc_time_to_print_stats (_tc)) \ { \ -if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ + TCP_EVT_CC_SND_STAT_PRINT(_tc); \ +} \ +} + +#define TCP_EVT_CC_STAT_PRINT(_tc) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ @@ -836,7 +851,15 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ ed->data[2] = tcp_snd_space (_tc); \ ed->data[3] = _tc->ssthresh; \ ed->data[4] = _tc->snd_wnd; \ - TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \ + TCP_EVT_CC_RTO_STAT_PRINT (_tc); \ + TCP_EVT_CC_SND_STAT_PRINT (_tc); \ +} + +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) 
\ +{ \ +if (tcp_cc_time_to_print_stats (_tc)) \ +{ \ + TCP_EVT_CC_STAT_PRINT (_tc); \ _tc->c_cc_stat_tstamp = tcp_time_now(); \ } \ } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 4e3987eeaed..39a538ba681 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -749,7 +749,7 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, /* Rule (3): if hole not lost */ else if (seq_lt (hole->start, sb->high_sacked)) { - *snd_limited = 1; + *snd_limited = 0; sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); } /* Rule (4): if hole beyond high_sacked */ @@ -993,10 +993,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); - ASSERT (sb->sacked_bytes == 0 + ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc) || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - - seq_max (tc->snd_una, ack)); + - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc)); ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) || sb->holes[sb->head].start == ack + sb->snd_una_adv); TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc); @@ -1052,6 +1052,9 @@ tcp_cc_init_congestion (tcp_connection_t * tc) tcp_fastrecovery_on (tc); tc->snd_congestion = tc->snd_una_max; tc->cwnd_acc_bytes = 0; + tc->snd_rxt_bytes = 0; + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; tc->cc_algo->congestion (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } @@ -1074,8 +1077,14 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->snd_rxt_bytes = 0; tc->rcv_dupacks = 0; tc->snd_nxt = tc->snd_una_max; + tc->snd_rxt_bytes = 0; + + /* HACK: since we don't have an output pacer, force slow start */ + tc->cwnd = 20 * tc->snd_mss; + tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); + tcp_fastrecovery_first_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -1088,13 +1097,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) tc->rcv_dupacks = 0; if (tcp_in_recovery (tc)) tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); ASSERT (tc->rto_boff == 0); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5); - /* TODO extend for fastrecovery */ } -static u8 -tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +static inline u8 +tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc) { return (tcp_in_recovery (tc) && tc->rto_boff == 1 && tc->snd_rxt_ts @@ -1102,6 +1112,20 @@ tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } +static inline u8 +tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc) +{ + return (tcp_in_fastrecovery (tc) + && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss); +} + +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tcp_cc_is_spurious_timeout_rxt (tc) + || tcp_cc_is_spurious_fast_rxt (tc)); +} + static int tcp_cc_recover (tcp_connection_t * tc) { @@ -1158,6 +1182,84 @@ tcp_should_fastrecover (tcp_connection_t * tc) || tcp_should_fastrecover_sack (tc)); } +void +tcp_program_fastretransmit (tcp_connection_t * tc) +{ + tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index]; + if (!(tc->flags & TCP_CONN_FRXT_PENDING)) + { + vec_add1 (wrk->pending_fast_rxt, tc->c_c_index); + tc->flags |= TCP_CONN_FRXT_PENDING; + } +} + +void +tcp_do_fastretransmits (u32 thread_index) +{ + tcp_worker_ctx_t *wrk = 
&tcp_main.wrk_ctx[thread_index]; + u32 max_burst_size, burst_size, n_segs = 0; + tcp_connection_t *tc; + int i; + + if (vec_len (wrk->pending_fast_rxt) == 0) + return; + + vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt); + vec_reset_length (wrk->pending_fast_rxt); + + max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt); + max_burst_size = clib_max (max_burst_size, 1); + + for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++) + { + tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index); + tc->flags &= ~TCP_CONN_FRXT_PENDING; + + if (!tcp_in_fastrecovery (tc)) + continue; + + /* TODO tx pacer instead of this */ + if (n_segs >= VLIB_FRAME_SIZE) + { + tcp_program_fastretransmit (tc); + continue; + } + + burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs); + + if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss) + { + /* The first segment MUST be retransmitted */ + if (tcp_retransmit_first_unacked (tc)) + { + tcp_program_fastretransmit (tc); + continue; + } + + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + 3 * tc->snd_mss; + + /* If cwnd allows, send more data */ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + { + scoreboard_init_high_rxt (&tc->sack_sb, + tc->snd_una + tc->snd_mss); + tc->sack_sb.rescue_rxt = tc->snd_una - 1; + n_segs += tcp_fast_retransmit_sack (tc, burst_size); + } + else + { + n_segs += tcp_fast_retransmit_no_sack (tc, burst_size); + } + } + else + n_segs += tcp_fast_retransmit (tc, burst_size); + } + vec_reset_length (wrk->ongoing_fast_rxt); +} + /** * One function to rule them all ... and in the darkness bind them */ @@ -1170,7 +1272,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { if (tc->bytes_acked) goto partial_ack; - tcp_fast_retransmit (tc); + tcp_program_fastretransmit (tc); return; } /* @@ -1196,20 +1298,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { ASSERT (!tcp_in_fastrecovery (tc)); - /* If of of the two conditions lower hold, reset dupacks because - * we're probably after timeout (RFC6582 heuristics). 
- * If Cumulative ack does not cover more than congestion threshold, - * and: - * 1) The following doesn't hold: The congestion window is greater - * than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes - * 2) Echoed timestamp in the last non-dup ack does not equal the - * stored timestamp - */ - if (seq_leq (tc->snd_una, tc->snd_congestion) - && ((!(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) + /* Heuristic to catch potential late dupacks + * after fast retransmit exits */ + if (is_dack && tc->snd_una == tc->snd_congestion + && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack)) { tc->rcv_dupacks = 0; return; @@ -1218,26 +1310,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) tcp_cc_init_congestion (tc); tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); - /* The first segment MUST be retransmitted */ - tcp_retransmit_first_unacked (tc); - - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver XXX */ - tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; - ASSERT (tc->cwnd >= tc->snd_mss); - - /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts)) - { - scoreboard_init_high_rxt (&tc->sack_sb, - tc->snd_una + tc->snd_mss); - tcp_fast_retransmit_sack (tc); - } - else - { - tcp_fast_retransmit_no_sack (tc); - } + tc->sack_sb.high_rxt = tc->snd_una; + + tcp_program_fastretransmit (tc); return; } else if (!tc->bytes_acked @@ -1249,6 +1325,28 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) else goto partial_ack; } + /* Don't allow entry in fast recovery if still in recovery, for now */ + else if (0 && is_dack && tcp_in_recovery (tc)) + { + /* If of of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) + { + tc->rcv_dupacks = 0; + return; + } + } if (!tc->bytes_acked) return; @@ -1259,14 +1357,11 @@ partial_ack: /* * Legitimate ACK. 1) See if we can exit recovery */ - /* XXX limit this only to first partial ack? */ - if (seq_lt (tc->snd_una, tc->snd_congestion)) - tcp_retransmit_timer_force_update (tc); - else - tcp_retransmit_timer_update (tc); if (seq_geq (tc->snd_una, tc->snd_congestion)) { + tcp_retransmit_timer_update (tc); + /* If spurious return, we've already updated everything */ if (tcp_cc_recover (tc)) { @@ -1286,6 +1381,9 @@ partial_ack: * Legitimate ACK. 2) If PARTIAL ACK try to retransmit */ + /* XXX limit this only to first partial ack? */ + tcp_retransmit_timer_force_update (tc); + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, * reset dupacks to 0. 
Also needed if in congestion recovery */ tc->rcv_dupacks = 0; @@ -1300,24 +1398,33 @@ partial_ack: } /* Remove retransmitted bytes that have been delivered */ - ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv - >= tc->sack_sb.last_bytes_delivered - || (tc->flags & TCP_CONN_FINSNT)); - - if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { + ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); + /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ - rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - - tc->sack_sb.last_bytes_delivered; - ASSERT (tc->snd_rxt_bytes >= rxt_delivered); - tc->snd_rxt_bytes -= rxt_delivered; + if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) + { + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; + ASSERT (tc->snd_rxt_bytes >= rxt_delivered); + tc->snd_rxt_bytes -= rxt_delivered; + } + else + { + /* Apparently all retransmitted holes have been acked */ + tc->snd_rxt_bytes = 0; + } } else { - /* Either all retransmitted holes have been acked, or we're - * "in the blind" and retransmitting segment by segment */ - tc->snd_rxt_bytes = 0; + if (tc->snd_rxt_bytes > tc->bytes_acked) + tc->snd_rxt_bytes -= tc->bytes_acked; + else + tc->snd_rxt_bytes = 0; } tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); @@ -1325,7 +1432,7 @@ partial_ack: /* * Since this was a partial ack, try to retransmit some more data */ - tcp_fast_retransmit (tc); + tcp_program_fastretransmit (tc); } /** diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ed1c641d80f..2e6036b410a 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1409,7 +1409,11 @@ tcp_rxt_timeout_cc (tcp_connection_t * tc) /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - tcp_cc_fastrecovery_exit (tc); + { + /* TODO be less aggressive about this */ + scoreboard_clear (&tc->sack_sb); + tcp_cc_fastrecovery_exit (tc); + } /* Start again from the beginning */ tc->cc_algo->congestion (tc); @@ -1487,6 +1491,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rxt_timeout_cc (tc); + else + scoreboard_clear (&tc->sack_sb); /* If we've sent beyond snd_congestion, update it */ if (seq_gt (tc->snd_una_max, tc->snd_congestion)) @@ -1499,9 +1505,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) * shortfall */ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); - /* TODO be less aggressive about this */ - scoreboard_clear (&tc->sack_sb); - if (n_bytes == 0) { tcp_retransmit_timer_force_update (tc); @@ -1680,7 +1683,7 @@ tcp_timer_persist_handler (u32 index) /** * Retransmit first unacked segment */ -void +int tcp_retransmit_first_unacked (tcp_connection_t * tc) { vlib_main_t *vm = vlib_get_main (); @@ -1691,20 +1694,23 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tc->snd_nxt = tc->snd_una; TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); if (!n_bytes) - return; + return -1; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - tc->snd_nxt = old_snd_nxt; + + return 0; } /** * Do fast retransmit with SACKs */ -void -tcp_fast_retransmit_sack (tcp_connection_t * tc) +int +tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size) { vlib_main_t *vm = 
vlib_get_main (); u32 n_written = 0, offset, max_bytes, n_segs = 0; @@ -1720,13 +1726,16 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) old_snd_nxt = tc->snd_nxt; sb = &tc->sack_sb; snd_space = tcp_available_cc_snd_space (tc); + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); if (snd_space < tc->snd_mss) - goto done; + { + tcp_program_fastretransmit (tc); + goto done; + } TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); - while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE) + while (snd_space > 0 && n_segs < burst_size) { hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), @@ -1736,7 +1745,21 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) || seq_gt (sb->rescue_rxt, tc->snd_congestion))) - break; + { + if (tcp_fastrecovery_first (tc)) + break; + + /* We tend to lose the first segment. Try re-resending + * it but only once and after we've tried everything */ + hole = scoreboard_first_hole (sb); + if (hole && hole->start == tc->snd_una) + { + tcp_retransmit_first_unacked (tc); + tcp_fastrecovery_first_on (tc); + n_segs += 1; + } + break; + } /* If rescue rxt undefined or less than snd_una then one segment of * up to SMSS octets that MUST include the highest outstanding @@ -1756,6 +1779,7 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + n_segs += 1; break; } @@ -1776,22 +1800,27 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); ASSERT (n_written <= snd_space); snd_space -= n_written; + n_segs += 1; } + if (hole) + tcp_program_fastretransmit (tc); + done: /* If window allows, send 1 SMSS of new data */ tc->snd_nxt = old_snd_nxt; + return n_segs; } /** * Fast retransmit without SACK info */ -void -tcp_fast_retransmit_no_sack (tcp_connection_t * tc) +int +tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size) { vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, bi, old_snd_nxt; - int snd_space; + int snd_space, n_segs = 0; vlib_buffer_t *b; ASSERT (tcp_in_fastrecovery (tc)); @@ -1802,7 +1831,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) tc->snd_nxt = tc->snd_una; snd_space = tcp_available_cc_snd_space (tc); - while (snd_space > 0) + while (snd_space > 0 && n_segs < burst_size) { offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b); @@ -1814,22 +1843,29 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; + n_segs += 1; } + /* More data to resend */ + if (seq_lt (tc->snd_nxt, tc->snd_congestion)) + tcp_program_fastretransmit (tc); + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ tc->snd_nxt = old_snd_nxt; + + return n_segs; } /** * Do fast retransmit */ -void -tcp_fast_retransmit (tcp_connection_t * tc) +int +tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size) { if (tcp_opts_sack_permitted (&tc->rcv_opts)) - tcp_fast_retransmit_sack (tc); + return tcp_fast_retransmit_sack (tc, burst_size); else - tcp_fast_retransmit_no_sack (tc); + return tcp_fast_retransmit_no_sack (tc, burst_size); } static u32 -- cgit 1.2.3-korg
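
The heart of the change is the two-phase fast-retransmit scheduler: tcp_program_fastretransmit() queues a connection at most once (guarded by TCP_CONN_FRXT_PENDING) on the worker's pending_fast_rxt vector, and tcp_do_fastretransmits(), called from tcp_update_time(), drains the queue while dividing a VLIB_FRAME_SIZE segment budget across all queued connections, so no single dispatch can burst enough to cause the tx losses the commit message mentions. Below is a minimal standalone sketch of that pattern, with fixed C arrays standing in for VPP vectors and a stub standing in for the retransmit routines; the names here are illustrative, not the VPP API.

#include <stdint.h>
#include <string.h>

#define FRAME_SIZE 256		/* stand-in for VLIB_FRAME_SIZE */
#define MAX_CONNS 1024

typedef struct
{
  uint32_t index;
  int frxt_pending;		/* stand-in for TCP_CONN_FRXT_PENDING */
  int in_fastrecovery;
} conn_t;

static conn_t conns[MAX_CONNS];
static uint32_t pending[MAX_CONNS];
static uint32_t n_pending;

static uint32_t
send_rxt_burst (conn_t * c, uint32_t budget)
{
  /* Placeholder for tcp_fast_retransmit (tc, burst_size); returns the
   * number of segments actually retransmitted, at most budget. */
  (void) c;
  return budget;
}

/* Queue a connection exactly once; the flag filters duplicates. */
static void
program_fastretransmit (conn_t * c)
{
  if (!c->frxt_pending)
    {
      pending[n_pending++] = c->index;
      c->frxt_pending = 1;
    }
}

/* Drain the queue, splitting one frame's worth of segments evenly across
 * all queued connections; connections hitting the cap are re-queued. */
static void
do_fastretransmits (void)
{
  uint32_t ongoing[MAX_CONNS];
  uint32_t n_ongoing = n_pending, n_segs = 0, max_burst, budget, i;

  if (n_ongoing == 0)
    return;
  memcpy (ongoing, pending, n_ongoing * sizeof (pending[0]));
  n_pending = 0;

  max_burst = FRAME_SIZE / n_ongoing;
  if (max_burst == 0)
    max_burst = 1;

  for (i = 0; i < n_ongoing; i++)
    {
      conn_t *c = &conns[ongoing[i]];
      c->frxt_pending = 0;
      if (!c->in_fastrecovery)
	continue;		/* recovery ended while queued */
      if (n_segs >= FRAME_SIZE)
	{
	  program_fastretransmit (c);	/* budget gone; retry next update */
	  continue;
	}
      budget = max_burst < FRAME_SIZE - n_segs ? max_burst : FRAME_SIZE - n_segs;
      n_segs += send_rxt_burst (c, budget);
    }
}

In the patch the retransmit routines themselves also call tcp_program_fastretransmit() whenever the scoreboard still has holes (or snd_nxt is still below snd_congestion in the no-SACK case) once snd_space or the burst budget runs out, so the remaining work simply resumes on the next timer update instead of overrunning the frame.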
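The window arithmetic during recovery follows RFC 5681, but it is now deferred: on the third duplicate ACK tcp_cc_init_congestion() saves prev_ssthresh/prev_cwnd, and cwnd is deflated to ssthresh + 3*SMSS only after tcp_retransmit_first_unacked() has actually put the mandatory first retransmit on the wire. That ordering appears to be what makes the new tcp_cc_is_spurious_fast_rxt() test work: if recovery ends while cwnd is still above ssthresh + 3*SMSS, the retransmit evidently never went out, so tcp_cc_congestion_undo() can restore the saved values. With SMSS = 1460 bytes and ssthresh cut to 14600, for instance, the deflated window would be 14600 + 3 * 1460 = 18980 bytes. A sketch of that bookkeeping, using an illustrative struct rather than the real tcp_connection_t:

#include <stdint.h>

typedef struct
{
  uint32_t cwnd, ssthresh, snd_mss;
  uint32_t prev_cwnd, prev_ssthresh;
} cc_state_t;

static void
enter_fast_recovery (cc_state_t * tc)
{
  tc->prev_ssthresh = tc->ssthresh;	/* saved for a possible undo */
  tc->prev_cwnd = tc->cwnd;
  /* Typical NewReno halving; the real value comes from
   * tc->cc_algo->congestion (). */
  tc->ssthresh = tc->cwnd / 2;
}

/* cwnd is only deflated to ssthresh + 3*SMSS after the first segment has
 * actually been retransmitted, so an undeflated cwnd marks an episode in
 * which no retransmit was sent, i.e. a spurious one. */
static int
is_spurious_fast_rxt (const cc_state_t * tc)
{
  return tc->cwnd > tc->ssthresh + 3 * tc->snd_mss;
}

static void
congestion_undo (cc_state_t * tc)
{
  tc->cwnd = tc->prev_cwnd;
  tc->ssthresh = tc->prev_ssthresh;
}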
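Finally, the first-segment heuristic in tcp_fast_retransmit_sack(): when scoreboard_next_rxt_hole() finds nothing left to send and the rescue rules do not apply, the patch re-sends the first unacked segment one more time per recovery episode (tracked by TCP_CONN_FRXT_FIRST and cleared on recovery exit), on the theory that the first segment, and sometimes its retransmit, is the usual casualty, and that losing it again would otherwise force an RTO. A self-contained sketch of the fallback, with stub types and functions in place of the scoreboard:

#include <stdint.h>
#include <stddef.h>

typedef struct
{
  uint32_t start;		/* first sequence number missing */
} hole_t;

typedef struct
{
  uint32_t snd_una;
  int frxt_first;		/* stand-in for TCP_CONN_FRXT_FIRST */
} conn_t;

static hole_t first_hole_storage = { 0 };

static hole_t *
scoreboard_first_hole_stub (void)
{
  return &first_hole_storage;	/* placeholder for scoreboard_first_hole () */
}

static int
retransmit_first_unacked_stub (conn_t * c)
{
  (void) c;
  return 0;			/* 0 = segment queued, mirroring the patch */
}

/* One-shot fallback: when nothing else is sendable, retry the segment at
 * snd_una, but only once per recovery episode. */
static uint32_t
retry_first_segment_once (conn_t * c)
{
  hole_t *h;

  if (c->frxt_first)
    return 0;
  h = scoreboard_first_hole_stub ();
  /* Only if the first hole still starts at snd_una, i.e. the original
   * retransmit of the first segment itself appears to have been lost. */
  if (h && h->start == c->snd_una && retransmit_first_unacked_stub (c) == 0)
    {
      c->frxt_first = 1;	/* cleared by tcp_cc_fastrecovery_exit () */
      return 1;
    }
  return 0;
}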