diff options
Diffstat (limited to 'src/vnet/tcp/tcp_output.c')
-rw-r--r-- | src/vnet/tcp/tcp_output.c | 231 |
1 files changed, 156 insertions, 75 deletions
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 78f7c3f8294..b15cf9b362b 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1240,49 +1240,27 @@ tcp_timer_delack_handler (u32 index) } /** - * Build a retransmit segment + * Allocate a new buffer and build a new tcp segment * - * @return the number of bytes in the segment or 0 if there's nothing to - * retransmit + * @param wrk tcp worker + * @param tc connection for which the segment will be allocated + * @param offset offset of the first byte in the tx fifo + * @param max_deq_byte segment size + * @param[out] b pointer to buffer allocated + * + * @return the number of bytes in the segment or 0 if buffer cannot be + * allocated or no data available */ -static u32 -tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, - tcp_connection_t * tc, u32 offset, - u32 max_deq_bytes, vlib_buffer_t ** b) +static int +tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b) { u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer; + u32 bi, seg_size; vlib_main_t *vm = wrk->vm; int n_bytes = 0; - u32 start, bi, available_bytes, seg_size; u8 *data; - ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_deq_bytes != 0); - - /* - * Make sure we can retransmit something - */ - available_bytes = session_tx_fifo_max_dequeue (&tc->connection); - ASSERT (available_bytes >= offset); - available_bytes -= offset; - if (!available_bytes) - return 0; - max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); - max_deq_bytes = clib_min (available_bytes, max_deq_bytes); - - /* Start is beyond snd_congestion */ - start = tc->snd_una + offset; - if (seq_geq (start, tc->snd_congestion)) - goto done; - - /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) - { - max_deq_bytes = tc->snd_congestion - start; - if (max_deq_bytes == 0) - goto done; - } - seg_size = max_deq_bytes + MAX_HDRS_LEN; /* @@ -1374,6 +1352,55 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, ASSERT (n_bytes > 0); ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer); + return n_bytes; +} + +/** + * Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + */ +static u32 +tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, + tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) +{ + u32 start, available_bytes; + int n_bytes = 0; + + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); + ASSERT (max_deq_bytes != 0); + + /* + * Make sure we can retransmit something + */ + available_bytes = session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); + available_bytes -= offset; + if (!available_bytes) + return 0; + + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); + + /* Start is beyond snd_congestion */ + start = tc->snd_una + offset; + if (seq_geq (start, tc->snd_congestion)) + goto done; + + /* Don't overshoot snd_congestion */ + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) + { + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) + goto done; + } + + n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b); + if (!n_bytes) + return 0; + if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1696,6 +1723,36 @@ tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) return 0; } +static int +tcp_fast_retransmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, + u32 burst_size) +{ + u32 offset, n_segs = 0, n_written, bi; + vlib_main_t *vm = wrk->vm; + vlib_buffer_t *b = 0; + + tc->snd_nxt = tc->snd_una_max; + offset = tc->snd_una_max - tc->snd_una; + while (n_segs < burst_size) + { + n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b); + if (!n_written) + goto done; + + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); + offset += n_written; + n_segs += 1; + } + +done: + return n_segs; +} + +#define scoreboard_rescue_rxt_valid(_sb, _tc) \ + (seq_geq (_sb->rescue_rxt, _tc->snd_una) \ + && seq_leq (_sb->rescue_rxt, _tc->snd_congestion)) + /** * Do fast retransmit with SACKs */ @@ -1703,55 +1760,54 @@ int tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { + u32 n_written = 0, offset, max_bytes, n_segs = 0, n_segs_now; + sack_scoreboard_hole_t *hole; vlib_main_t *vm = wrk->vm; - u32 n_written = 0, offset, max_bytes, n_segs = 0; vlib_buffer_t *b = 0; - sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; int snd_space; + u32 max_deq; u8 snd_limited = 0, can_rescue = 0; ASSERT (tcp_in_fastrecovery (tc)); - old_snd_nxt = tc->snd_nxt; - sb = &tc->sack_sb; snd_space = tcp_available_cc_snd_space (tc); - hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); - if (snd_space < tc->snd_mss) { tcp_program_fastretransmit (wrk, tc); - goto done; + return 0; } TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + while (snd_space > 0 && n_segs < burst_size) { - hole = scoreboard_next_rxt_hole (sb, hole, - tcp_fastrecovery_sent_1_smss (tc), - &can_rescue, &snd_limited); + hole = scoreboard_next_rxt_hole (sb, hole, max_deq, &can_rescue, + &snd_limited); if (!hole) { - if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) - || seq_gt (sb->rescue_rxt, - tc->snd_congestion))) + if (max_deq) { - if (tcp_fastrecovery_first (tc)) - break; - - /* We tend to lose the first segment. Try re-resending - * it but only once and after we've tried everything */ - hole = scoreboard_first_hole (sb); - if (hole && hole->start == tc->snd_una) - { - tcp_retransmit_first_unacked (wrk, tc); - tcp_fastrecovery_first_on (tc); - n_segs += 1; - } - break; + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, + snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + goto done; } + if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc)) + break; + /* If rescue rxt undefined or less than snd_una then one segment of * up to SMSS octets that MUST include the highest outstanding * unSACKed sequence number SHOULD be returned, and RescueRxt set to @@ -1778,19 +1834,21 @@ tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; if (max_bytes == 0) break; + offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = sb->high_rxt; n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes, &b); + ASSERT (n_written <= snd_space); /* Nothing left to retransmit */ if (n_written == 0) break; bi = vlib_get_buffer_index (vm, b); - sb->high_rxt += n_written; tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); - ASSERT (n_written <= snd_space); + + sb->high_rxt += n_written; snd_space -= n_written; n_segs += 1; } @@ -1811,24 +1869,26 @@ int tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, u32 burst_size) { - u32 n_written = 0, offset = 0, bi, old_snd_nxt; + u32 n_written = 0, offset = 0, bi, old_snd_nxt, max_deq, n_segs_now; vlib_main_t *vm = wrk->vm; int snd_space, n_segs = 0; vlib_buffer_t *b; ASSERT (tcp_in_fastrecovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* Start resending from first un-acked segment */ old_snd_nxt = tc->snd_nxt; - tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_cc_snd_space (tc); + if (!tcp_fastrecovery_first (tc)) + goto send_unsent; + + /* RFC 6582: [If a partial ack], retransmit the first unacknowledged + * segment. */ + snd_space = tc->sack_sb.last_bytes_delivered; + tc->snd_nxt = tc->snd_una; while (snd_space > 0 && n_segs < burst_size) { - offset += n_written; - n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, snd_space, - &b); + n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, + tc->snd_mss, &b); /* Nothing left to retransmit */ if (n_written == 0) @@ -1837,16 +1897,37 @@ tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); snd_space -= n_written; + offset += n_written; n_segs += 1; } - /* More data to resend */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tcp_program_fastretransmit (wrk, tc); + if (n_segs == burst_size) + goto done; + +send_unsent: - /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */ + snd_space = tcp_available_cc_snd_space (tc); + if (snd_space < tc->snd_mss) + goto done; + + max_deq = session_tx_fifo_max_dequeue (&tc->connection); + max_deq -= tc->snd_una_max - tc->snd_una; + if (max_deq) + { + snd_space = clib_min (max_deq, snd_space); + burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss); + n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size); + if (max_deq > n_segs_now * tc->snd_mss) + tcp_program_fastretransmit (wrk, tc); + n_segs += n_segs_now; + } + + /* Restore snd_nxt */ tc->snd_nxt = old_snd_nxt; +done: + tcp_fastrecovery_first_off (tc); return n_segs; } |