Diffstat (limited to 'src')
-rw-r--r--   src/vnet/session/session.c      |   2
-rw-r--r--   src/vnet/session/session_node.c |   4
-rw-r--r--   src/vnet/tcp/tcp.c              |  51
-rw-r--r--   src/vnet/tcp/tcp_input.c        |  29
-rw-r--r--   src/vnet/tcp/tcp_output.c       | 231
5 files changed, 196 insertions(+), 121 deletions(-)
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index f6894868a3a..b944f5a2104 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -1081,7 +1081,7 @@ stream_session_disconnect (stream_session_t * s)
    * held, just append a new event to pending disconnects vector. */
   if (vlib_thread_is_main_w_barrier () || thread_index == s->thread_index)
     {
-      wrk = session_manager_get_worker (thread_index);
+      wrk = session_manager_get_worker (s->thread_index);
       vec_add2 (wrk->pending_disconnects, evt, 1);
       clib_memset (evt, 0, sizeof (*evt));
       evt->session_handle = session_handle (s);
diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c
index 5ed681d03c7..edc518ee872 100644
--- a/src/vnet/session/session_node.c
+++ b/src/vnet/session/session_node.c
@@ -789,7 +789,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   /* Make sure postponed events are handled first */
   fifo_events = wrk->free_event_vector;
   vec_append (fifo_events, wrk->postponed_event_vector);
-  _vec_len (wrk->pending_disconnects) = 0;
+  _vec_len (wrk->postponed_event_vector) = 0;
 
   /* Try to dequeue what is available. Don't wait for lock.
    * XXX: we may need priorities here */
@@ -810,8 +810,8 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 
       vec_append (fifo_events, wrk->pending_event_vector);
       vec_append (fifo_events, wrk->pending_disconnects);
-      _vec_len (wrk->postponed_event_vector) = 0;
       _vec_len (wrk->pending_event_vector) = 0;
+      _vec_len (wrk->pending_disconnects) = 0;
 
       n_events = vec_len (fifo_events);
       if (PREDICT_FALSE (!n_events))
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 1fb95b3ad3a..f8e74a88fcf 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -972,13 +972,13 @@ format_tcp_scoreboard (u8 * s, va_list * args)
 
   hole = scoreboard_first_hole (sb);
   if (hole)
-    s = format (s, "\n%Uhead %u tail %u %u holes:\n", format_white_space,
-                indent, sb->head, sb->tail, pool_elts (sb->holes));
+    s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space,
+                indent, sb->head, sb->tail, pool_elts (sb->holes),
+                format_white_space, indent);
 
   while (hole)
     {
-      s = format (s, "%U%U", format_white_space, indent, format_tcp_sack_hole,
-                  hole, tc);
+      s = format (s, "%U", format_tcp_sack_hole, hole, tc);
       hole = scoreboard_next_hole (sb, hole);
     }
 
@@ -1051,38 +1051,25 @@ tcp_snd_space_inline (tcp_connection_t * tc)
 {
   int snd_space, snt_limited;
 
-  if (PREDICT_TRUE (!tcp_in_fastrecovery (tc)))
-    {
-      snd_space = tcp_available_output_snd_space (tc);
-
-      /* If we haven't gotten dupacks or if we did and have gotten sacked
-       * bytes then we can still send as per Limited Transmit (RFC3042) */
-      if (PREDICT_FALSE (tc->rcv_dupacks != 0
-                         && (tcp_opts_sack_permitted (tc)
-                             && tc->sack_sb.last_sacked_bytes == 0)))
-        {
-          if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt)
-            tc->limited_transmit = tc->snd_nxt;
-          ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
+  if (PREDICT_FALSE (tcp_in_fastrecovery (tc)))
+    return 0;
 
-          snt_limited = tc->snd_nxt - tc->limited_transmit;
-          snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0);
-        }
-      return tcp_round_snd_space (tc, snd_space);
-    }
+  snd_space = tcp_available_output_snd_space (tc);
 
-  /* RFC 5681: When previously unsent data is available and the new value of
-   * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS
-   * bytes of previously unsent data. */
-  if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc))
+  /* If we haven't gotten dupacks or if we did and have gotten sacked
+   * bytes then we can still send as per Limited Transmit (RFC3042) */
+  if (PREDICT_FALSE (tc->rcv_dupacks != 0
+                     && (tcp_opts_sack_permitted (tc)
+                         && tc->sack_sb.last_sacked_bytes == 0)))
     {
-      if (tcp_available_cc_snd_space (tc) < tc->snd_mss)
-        return 0;
-      tcp_fastrecovery_1_smss_on (tc);
-      return tc->snd_mss;
-    }
+      if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt)
+        tc->limited_transmit = tc->snd_nxt;
+      ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
 
-  return 0;
+      snt_limited = tc->snd_nxt - tc->limited_transmit;
+      snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0);
+    }
+  return tcp_round_snd_space (tc, snd_space);
 }
 
 u32
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 21e5f3cdaba..0b79a6699d7 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -719,8 +719,7 @@ scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb)
 sack_scoreboard_hole_t *
 scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
                           sack_scoreboard_hole_t * start,
-                          u8 have_sent_1_smss,
-                          u8 * can_rescue, u8 * snd_limited)
+                          u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
 {
   sack_scoreboard_hole_t *hole = 0;
 
@@ -742,11 +741,11 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
     }
   else
     {
-      /* Rule (2): output takes care of transmitting new data */
-      if (!have_sent_1_smss)
+      /* Rule (2): available unsent data */
+      if (have_unsent)
         {
-          hole = 0;
           sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+          return 0;
         }
       /* Rule (3): if hole not lost */
       else if (seq_lt (hole->start, sb->high_sacked))
@@ -772,16 +771,17 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
 }
 
 static void
-scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq)
+scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 snd_una)
 {
   sack_scoreboard_hole_t *hole;
   hole = scoreboard_first_hole (sb);
   if (hole)
     {
-      seq = seq_gt (seq, hole->start) ? seq : hole->start;
+      snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
       sb->cur_rxt_hole = sb->head;
     }
-  sb->high_rxt = seq;
+  sb->high_rxt = snd_una;
+  sb->rescue_rxt = snd_una - 1;
 }
 
 void
@@ -1306,7 +1306,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
         {
           tc->cwnd = tc->ssthresh;
           scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
-          tc->sack_sb.rescue_rxt = tc->snd_una - 1;
         }
       else
         {
@@ -1316,6 +1315,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
           tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
         }
 
+      /* Constrain rate until we get a partial ack */
       pacer_wnd = clib_max (0.1 * tc->cwnd, 2 * tc->snd_mss);
       tcp_connection_tx_pacer_reset (tc, pacer_wnd, 0 /* start bucket */ );
 
@@ -1387,6 +1387,10 @@ partial_ack:
    * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
    */
 
+  /* Update the pacing rate. For the first partial ack we move from
+   * the artificially constrained rate to the one after congestion */
+  tcp_connection_tx_pacer_update (tc);
+
   /* XXX limit this only to first partial ack? */
   tcp_retransmit_timer_force_update (tc);
 
@@ -1422,10 +1426,14 @@ partial_ack:
         {
           /* Apparently all retransmitted holes have been acked */
           tc->snd_rxt_bytes = 0;
+          tc->sack_sb.high_rxt = tc->snd_una;
         }
     }
   else
     {
+      tcp_fastrecovery_first_on (tc);
+      /* Reuse last bytes delivered to track total bytes acked */
+      tc->sack_sb.last_bytes_delivered += tc->bytes_acked;
       if (tc->snd_rxt_bytes > tc->bytes_acked)
         tc->snd_rxt_bytes -= tc->bytes_acked;
       else
@@ -1473,7 +1481,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
     {
       tcp_make_ack (tc, b);
       *next = tcp_next_output (tc->c_is_ip4);
-      *error = TCP_ERROR_ACK_INVALID;
+      *error = TCP_ERROR_ACK_FUTURE;
      TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
                   vnet_buffer (b)->tcp.ack_number);
      return -1;
@@ -1483,7 +1491,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
                    vnet_buffer (b)->tcp.ack_number);
       tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
-      *error = TCP_ERROR_ACK_FUTURE;
     }
 
   /* If old ACK, probably it's an old dupack */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 78f7c3f8294..b15cf9b362b 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1240,49 +1240,27 @@ tcp_timer_delack_handler (u32 index)
 }
 
 /**
- * Build a retransmit segment
+ * Allocate a new buffer and build a new tcp segment
  *
- * @return the number of bytes in the segment or 0 if there's nothing to
- * retransmit
+ * @param wrk           tcp worker
+ * @param tc            connection for which the segment will be allocated
+ * @param offset        offset of the first byte in the tx fifo
+ * @param max_deq_byte  segment size
+ * @param[out] b        pointer to buffer allocated
+ *
+ * @return the number of bytes in the segment or 0 if buffer cannot be
+ * allocated or no data available
  */
-static u32
-tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk,
-                                tcp_connection_t * tc, u32 offset,
-                                u32 max_deq_bytes, vlib_buffer_t ** b)
+static int
+tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
+                     u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b)
 {
   u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer;
+  u32 bi, seg_size;
   vlib_main_t *vm = wrk->vm;
   int n_bytes = 0;
-  u32 start, bi, available_bytes, seg_size;
   u8 *data;
 
-  ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
-  ASSERT (max_deq_bytes != 0);
-
-  /*
-   * Make sure we can retransmit something
-   */
-  available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
-  ASSERT (available_bytes >= offset);
-  available_bytes -= offset;
-  if (!available_bytes)
-    return 0;
-  max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
-  max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
-
-  /* Start is beyond snd_congestion */
-  start = tc->snd_una + offset;
-  if (seq_geq (start, tc->snd_congestion))
-    goto done;
-
-  /* Don't overshoot snd_congestion */
-  if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
-    {
-      max_deq_bytes = tc->snd_congestion - start;
-      if (max_deq_bytes == 0)
-        goto done;
-    }
-
   seg_size = max_deq_bytes + MAX_HDRS_LEN;
 
   /*
@@ -1374,6 +1352,55 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk,
   ASSERT (n_bytes > 0);
   ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer);
 
+  return n_bytes;
+}
+
+/**
+ * Build a retransmit segment
+ *
+ * @return the number of bytes in the segment or 0 if there's nothing to
+ * retransmit
+ */
+static u32
+tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk,
+                                tcp_connection_t * tc, u32 offset,
+                                u32 max_deq_bytes, vlib_buffer_t ** b)
+{
+  u32 start, available_bytes;
+  int n_bytes = 0;
+
+  ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
+  ASSERT (max_deq_bytes != 0);
+
+  /*
+   * Make sure we can retransmit something
+   */
+  available_bytes = session_tx_fifo_max_dequeue (&tc->connection);
+  ASSERT (available_bytes >= offset);
+  available_bytes -= offset;
+  if (!available_bytes)
+    return 0;
+
+  max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
+  max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
+
+  /* Start is beyond snd_congestion */
+  start = tc->snd_una + offset;
+  if (seq_geq (start, tc->snd_congestion))
+    goto done;
+
+  /* Don't overshoot snd_congestion */
+  if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
+    {
+      max_deq_bytes = tc->snd_congestion - start;
+      if (max_deq_bytes == 0)
+        goto done;
+    }
+
+  n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b);
+  if (!n_bytes)
+    return 0;
+
   if (tcp_in_fastrecovery (tc))
     tc->snd_rxt_bytes += n_bytes;
 
@@ -1696,6 +1723,36 @@ tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
   return 0;
 }
 
+static int
+tcp_fast_retransmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
+                            u32 burst_size)
+{
+  u32 offset, n_segs = 0, n_written, bi;
+  vlib_main_t *vm = wrk->vm;
+  vlib_buffer_t *b = 0;
+
+  tc->snd_nxt = tc->snd_una_max;
+  offset = tc->snd_una_max - tc->snd_una;
+  while (n_segs < burst_size)
+    {
+      n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
+      if (!n_written)
+        goto done;
+
+      bi = vlib_get_buffer_index (vm, b);
+      tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
+      offset += n_written;
+      n_segs += 1;
+    }
+
+done:
+  return n_segs;
+}
+
+#define scoreboard_rescue_rxt_valid(_sb, _tc)                   \
+  (seq_geq (_sb->rescue_rxt, _tc->snd_una)                      \
+   && seq_leq (_sb->rescue_rxt, _tc->snd_congestion))
+
 /**
  * Do fast retransmit with SACKs
  */
@@ -1703,55 +1760,54 @@ int
 tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
                           u32 burst_size)
 {
+  u32 n_written = 0, offset, max_bytes, n_segs = 0, n_segs_now;
+  sack_scoreboard_hole_t *hole;
   vlib_main_t *vm = wrk->vm;
-  u32 n_written = 0, offset, max_bytes, n_segs = 0;
   vlib_buffer_t *b = 0;
-  sack_scoreboard_hole_t *hole;
   sack_scoreboard_t *sb;
   u32 bi, old_snd_nxt;
   int snd_space;
+  u32 max_deq;
   u8 snd_limited = 0, can_rescue = 0;
 
   ASSERT (tcp_in_fastrecovery (tc));
 
-  old_snd_nxt = tc->snd_nxt;
-  sb = &tc->sack_sb;
   snd_space = tcp_available_cc_snd_space (tc);
-  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
-
   if (snd_space < tc->snd_mss)
     {
      tcp_program_fastretransmit (wrk, tc);
-      goto done;
+      return 0;
    }
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
+  old_snd_nxt = tc->snd_nxt;
+  sb = &tc->sack_sb;
+  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
+
+  max_deq = session_tx_fifo_max_dequeue (&tc->connection);
+  max_deq -= tc->snd_una_max - tc->snd_una;
+
   while (snd_space > 0 && n_segs < burst_size)
     {
-      hole = scoreboard_next_rxt_hole (sb, hole,
-                                       tcp_fastrecovery_sent_1_smss (tc),
-                                       &can_rescue, &snd_limited);
+      hole = scoreboard_next_rxt_hole (sb, hole, max_deq, &can_rescue,
+                                       &snd_limited);
       if (!hole)
         {
-          if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
-                               || seq_gt (sb->rescue_rxt,
-                                          tc->snd_congestion)))
+          if (max_deq)
             {
-              if (tcp_fastrecovery_first (tc))
-                break;
-
-              /* We tend to lose the first segment. Try re-resending
-               * it but only once and after we've tried everything */
-              hole = scoreboard_first_hole (sb);
-              if (hole && hole->start == tc->snd_una)
-                {
-                  tcp_retransmit_first_unacked (wrk, tc);
-                  tcp_fastrecovery_first_on (tc);
-                  n_segs += 1;
-                }
-              break;
+              snd_space = clib_min (max_deq, snd_space);
+              burst_size = clib_min (burst_size - n_segs,
+                                     snd_space / tc->snd_mss);
+              n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size);
+              if (max_deq > n_segs_now * tc->snd_mss)
+                tcp_program_fastretransmit (wrk, tc);
+              n_segs += n_segs_now;
+              goto done;
             }
+          if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc))
+            break;
+
           /* If rescue rxt undefined or less than snd_una then one segment of
            * up to SMSS octets that MUST include the highest outstanding
            * unSACKed sequence number SHOULD be returned, and RescueRxt set to
@@ -1778,19 +1834,21 @@ tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
       max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes;
       if (max_bytes == 0)
         break;
+
       offset = sb->high_rxt - tc->snd_una;
       tc->snd_nxt = sb->high_rxt;
       n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes,
                                                   &b);
+      ASSERT (n_written <= snd_space);
 
       /* Nothing left to retransmit */
       if (n_written == 0)
         break;
 
       bi = vlib_get_buffer_index (vm, b);
-      sb->high_rxt += n_written;
       tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
-      ASSERT (n_written <= snd_space);
+
+      sb->high_rxt += n_written;
       snd_space -= n_written;
       n_segs += 1;
     }
@@ -1811,24 +1869,26 @@ int
 tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
                              u32 burst_size)
 {
-  u32 n_written = 0, offset = 0, bi, old_snd_nxt;
+  u32 n_written = 0, offset = 0, bi, old_snd_nxt, max_deq, n_segs_now;
   vlib_main_t *vm = wrk->vm;
   int snd_space, n_segs = 0;
   vlib_buffer_t *b;
 
   ASSERT (tcp_in_fastrecovery (tc));
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
-
-  /* Start resending from first un-acked segment */
   old_snd_nxt = tc->snd_nxt;
-  tc->snd_nxt = tc->snd_una;
-  snd_space = tcp_available_cc_snd_space (tc);
+
+  if (!tcp_fastrecovery_first (tc))
+    goto send_unsent;
+
+  /* RFC 6582: [If a partial ack], retransmit the first unacknowledged
+   * segment. */
+  snd_space = tc->sack_sb.last_bytes_delivered;
+  tc->snd_nxt = tc->snd_una;
 
   while (snd_space > 0 && n_segs < burst_size)
     {
-      offset += n_written;
-      n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, snd_space,
-                                                  &b);
+      n_written = tcp_prepare_retransmit_segment (wrk, tc, offset,
+                                                  tc->snd_mss, &b);
 
       /* Nothing left to retransmit */
       if (n_written == 0)
@@ -1837,16 +1897,37 @@ tcp_fast_retransmit_no_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
       bi = vlib_get_buffer_index (vm, b);
       tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
       snd_space -= n_written;
+      offset += n_written;
       n_segs += 1;
     }
 
-  /* More data to resend */
-  if (seq_lt (tc->snd_nxt, tc->snd_congestion))
-    tcp_program_fastretransmit (wrk, tc);
+  if (n_segs == burst_size)
+    goto done;
+
+send_unsent:
 
-  /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
+  /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */
+  snd_space = tcp_available_cc_snd_space (tc);
+  if (snd_space < tc->snd_mss)
+    goto done;
+
+  max_deq = session_tx_fifo_max_dequeue (&tc->connection);
+  max_deq -= tc->snd_una_max - tc->snd_una;
+  if (max_deq)
+    {
+      snd_space = clib_min (max_deq, snd_space);
+      burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss);
+      n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size);
+      if (max_deq > n_segs_now * tc->snd_mss)
+        tcp_program_fastretransmit (wrk, tc);
+      n_segs += n_segs_now;
+    }
+
+  /* Restore snd_nxt */
   tc->snd_nxt = old_snd_nxt;
+done:
+  tcp_fastrecovery_first_off (tc);
   return n_segs;
 }