From 1f152cd6faf96b524b6b7071b5cffe1916f9c5cc Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 18 Aug 2017 19:28:03 -0700 Subject: tcp: retransmit and multi-buffer segment fixes and improvements - set session state as closed on session manager delete - enable retransmit as opposed to persist timer after persist timer completes - properly discard buffer chain bytes when new data overlaps ooo segments - don't use rxt bytes in snd space estimate used on tx path Change-Id: Id9cab686e532e5fe70c775d5440260e8eb890a9f Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 14 +- src/vnet/session/segment_manager.c | 1 + src/vnet/session/session.c | 105 ++++++++++++--- src/vnet/session/session_node.c | 26 ++-- src/vnet/session/stream_session.h | 2 +- src/vnet/tcp/tcp.c | 39 +++--- src/vnet/tcp/tcp.h | 23 +++- src/vnet/tcp/tcp_input.c | 27 ++-- src/vnet/tcp/tcp_output.c | 264 +++++++++++++++++++++---------------- 9 files changed, 319 insertions(+), 182 deletions(-) (limited to 'src') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 7f8127cfa69..8fe82f56abd 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -53,10 +53,12 @@ ooo_segment_end_pos (svm_fifo_t * f, ooo_segment_t * s) u8 * format_ooo_segment (u8 * s, va_list * args) { + svm_fifo_t *f = va_arg (*args, svm_fifo_t *); ooo_segment_t *seg = va_arg (*args, ooo_segment_t *); - - s = format (s, "pos %u, len %u, next %d, prev %d", - seg->start, seg->length, seg->next, seg->prev); + u32 normalized_start = (seg->start + f->nitems - f->tail) % f->nitems; + s = format (s, "[%u, %u], len %u, next %d, prev %d", normalized_start, + (normalized_start + seg->length) % f->nitems, seg->length, + seg->next, seg->prev); return s; } @@ -154,7 +156,7 @@ format_ooo_list (u8 * s, va_list * args) while (ooo_segment_index != OOO_SEGMENT_INVALID_INDEX) { seg = pool_elt_at_index (f->ooo_segments, ooo_segment_index); - s = format (s, " %U\n", format_ooo_segment, seg); + s = format (s, " %U\n", format_ooo_segment, f, seg); ooo_segment_index = seg->next; } @@ -557,7 +559,6 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems, normalized_offset; - u32 offset_from_tail; f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; @@ -570,8 +571,7 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? 
*/ - offset_from_tail = (nitems + normalized_offset - f->tail) % nitems; - if ((required_bytes + offset_from_tail) > (nitems - cursize)) + if ((required_bytes + offset) > (nitems - cursize)) return -1; svm_fifo_trace_add (f, offset, required_bytes, 1); diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index 262b7faab92..43977063e0e 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -224,6 +224,7 @@ segment_manager_del (segment_manager_t * sm) session = stream_session_get (session_index, thread_index); /* Instead of directly removing the session call disconnect */ + session->session_state = SESSION_STATE_CLOSED; session_send_session_evt_to_thread (stream_session_handle (session), FIFO_EVENT_DISCONNECT, thread_index); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 70a5cd83749..6fe990476e7 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -92,38 +92,104 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, return 0; } -/** Enqueue buffer chain tail */ +/** + * Discards bytes from buffer chain + * + * It discards n_bytes_to_drop starting at first buffer after chain_b + */ +always_inline void +session_enqueue_discard_chain_bytes (vlib_main_t * vm, vlib_buffer_t * b, + vlib_buffer_t ** chain_b, + u32 n_bytes_to_drop) +{ + vlib_buffer_t *next = *chain_b; + u32 to_drop = n_bytes_to_drop; + ASSERT (b->flags & VLIB_BUFFER_NEXT_PRESENT); + while (to_drop && (next->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + next = vlib_get_buffer (vm, next->next_buffer); + if (next->current_length > to_drop) + { + vlib_buffer_advance (next, to_drop); + to_drop = 0; + } + else + { + to_drop -= next->current_length; + next->current_length = 0; + } + } + *chain_b = next; + + if (to_drop == 0) + b->total_length_not_including_first_buffer -= n_bytes_to_drop; +} + +/** + * Enqueue buffer chain tail + */ always_inline int session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, u32 offset, u8 is_in_order) { vlib_buffer_t *chain_b; - u32 chain_bi = b->next_buffer, len; + u32 chain_bi, len, diff; vlib_main_t *vm = vlib_get_main (); u8 *data; - u16 written = 0; + u32 written = 0; int rv = 0; + if (is_in_order && offset) + { + diff = offset - b->current_length; + if (diff > b->total_length_not_including_first_buffer) + return 0; + chain_b = b; + session_enqueue_discard_chain_bytes (vm, b, &chain_b, diff); + chain_bi = vlib_get_buffer_index (vm, chain_b); + } + else + chain_bi = b->next_buffer; + do { chain_b = vlib_get_buffer (vm, chain_bi); data = vlib_buffer_get_current (chain_b); len = chain_b->current_length; + if (!len) + continue; if (is_in_order) { rv = svm_fifo_enqueue_nowait (s->server_rx_fifo, len, data); - if (rv < len) + if (rv == len) + { + written += rv; + } + else if (rv < len) { return (rv > 0) ? 
(written + rv) : written; } - written += rv; + else if (rv > len) + { + written += rv; + + /* written more than what was left in chain */ + if (written > b->total_length_not_including_first_buffer) + return written; + + /* drop the bytes that have already been delivered */ + session_enqueue_discard_chain_bytes (vm, b, &chain_b, rv - len); + } } else { rv = svm_fifo_enqueue_with_offset (s->server_rx_fifo, offset, len, data); if (rv) - return -1; + { + clib_warning ("failed to enqueue multi-buffer seg"); + return -1; + } offset += len; } } @@ -155,22 +221,22 @@ stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order) { stream_session_t *s; - int enqueued = 0, rv; + int enqueued = 0, rv, in_order_off; s = stream_session_get (tc->s_index, tc->thread_index); if (is_in_order) { - enqueued = - svm_fifo_enqueue_nowait (s->server_rx_fifo, b->current_length, - vlib_buffer_get_current (b)); - if (PREDICT_FALSE - ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && enqueued > 0)) + enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, + b->current_length, + vlib_buffer_get_current (b)); + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) + && enqueued >= 0)) { - rv = session_enqueue_chain_tail (s, b, 0, 1); - if (rv <= 0) - return enqueued; - enqueued += rv; + in_order_off = enqueued > b->current_length ? enqueued : 0; + rv = session_enqueue_chain_tail (s, b, in_order_off, 1); + if (rv > 0) + enqueued += rv; } } else @@ -179,9 +245,10 @@ stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, b->current_length, vlib_buffer_get_current (b)); if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && !rv)) - rv = session_enqueue_chain_tail (s, b, offset + b->current_length, 0); - if (rv) - return -1; + session_enqueue_chain_tail (s, b, offset + b->current_length, 0); + /* if something was enqueued, report even this as success for ooo + * segment handling */ + return rv; } if (queue_event) diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index fac2b852880..cd52742bc67 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -76,7 +76,7 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, u8 thread_index, svm_fifo_t * fifo, vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, u32 left_from_seg, u32 * left_to_snd0, - u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf, + u16 * n_bufs, u32 * tx_offset, u16 deq_per_buf, u8 peek_data) { vlib_buffer_t *chain_b0, *prev_b0; @@ -104,8 +104,8 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, data0 = vlib_buffer_get_current (chain_b0); if (peek_data) { - n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0); - *rx_offset += n_bytes_read; + n_bytes_read = svm_fifo_peek (fifo, *tx_offset, len_to_deq0, data0); + *tx_offset += n_bytes_read; } else { @@ -126,7 +126,8 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, if (to_deq == 0) break; } - ASSERT (to_deq == 0); + ASSERT (to_deq == 0 + && b0->total_length_not_including_first_buffer == left_from_seg); *left_to_snd0 -= left_from_seg; } @@ -144,7 +145,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, transport_proto_vft_t *transport_vft; u32 next_index, next0, *to_next, n_left_to_next, bi0; vlib_buffer_t *b0; - u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg; + u32 tx_offset = 0, max_dequeue0, n_bytes_per_seg, left_for_seg; u16 snd_mss0, n_bufs_per_seg, n_bufs; 
u8 *data0; int i, n_bytes_read; @@ -170,11 +171,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (peek_data) { /* Offset in rx fifo from where to peek data */ - rx_offset = transport_vft->tx_fifo_offset (tc0); + tx_offset = transport_vft->tx_fifo_offset (tc0); } /* Check how much we can pull. If buffering, subtract the offset */ - max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - tx_offset; /* Nothing to read return */ if (max_dequeue0 == 0) @@ -193,6 +194,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, } else { + /* Expectation is that snd_space0 is already a multiple of snd_mss */ max_len_to_snd0 = snd_space0; } @@ -265,8 +267,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); b0->error = 0; - b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID - | VNET_BUFFER_F_LOCALLY_ORIGINATED; + b0->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; @@ -274,11 +275,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); if (peek_data) { - n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, tx_offset, len_to_deq0, data0); /* Keep track of progress locally, transport is also supposed to * increment it independently when pushing the header */ - rx_offset += n_bytes_read; + tx_offset += n_bytes_read; } else { @@ -299,12 +300,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (PREDICT_FALSE (n_bufs_per_seg > 1 && left_to_snd0)) { - u32 left_for_seg; left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0); session_tx_fifo_chain_tail (smm, vm, thread_index, s0->server_tx_fifo, b0, bi0, n_bufs_per_seg, left_for_seg, - &left_to_snd0, &n_bufs, &rx_offset, + &left_to_snd0, &n_bufs, &tx_offset, deq_per_buf, peek_data); } diff --git a/src/vnet/session/stream_session.h b/src/vnet/session/stream_session.h index 533cf97fef9..275052d3ee5 100644 --- a/src/vnet/session/stream_session.h +++ b/src/vnet/session/stream_session.h @@ -56,7 +56,7 @@ typedef struct _stream_session_t u8 session_type; /** State */ - u8 session_state; + volatile u8 session_state; u8 thread_index; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 6edf52af6f5..197fff9647c 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -798,7 +798,8 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, "rtt_seq %u\n", tc->rtt_seq); s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, tcp_time_now () - tc->tsval_recent_age); - s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb, + tc); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -858,7 +859,7 @@ format_tcp_session (u8 * s, va_list * args) if (tc) s = format (s, "%U", format_tcp_connection, tc, verbose); else - s = format (s, "empty"); + s = format (s, "empty\n"); return s; } @@ -930,7 +931,11 @@ u8 * format_tcp_sack_hole (u8 * s, va_list * args) { sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); - s = format (s, "[%u, %u]", hole->start, hole->end); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tc) + s = format (s, " [%u, %u]", 
hole->start - tc->iss, hole->end - tc->iss); + else + s = format (s, " [%u, %u]", hole->start, hole->end); return s; } @@ -938,6 +943,7 @@ u8 * format_tcp_scoreboard (u8 * s, va_list * args) { sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); sack_scoreboard_hole_t *hole; s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); @@ -952,7 +958,7 @@ format_tcp_scoreboard (u8 * s, va_list * args) while (hole) { - s = format (s, "%U", format_tcp_sack_hole, hole); + s = format (s, "%U", format_tcp_sack_hole, hole, tc); hole = scoreboard_next_hole (sb, hole); } @@ -1001,13 +1007,10 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; } - /* If we can't write at least a segment, don't try at all */ + /* If not snd_wnd constrained and we can't write at least a segment, + * don't try at all */ if (PREDICT_FALSE (snd_space < tc->snd_mss)) - { - if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) - return snd_space; - return 0; - } + return 0; /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1030,7 +1033,7 @@ tcp_snd_space (tcp_connection_t * tc) if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) { - snd_space = tcp_available_snd_space (tc); + snd_space = tcp_available_output_snd_space (tc); /* If we haven't gotten dupacks or if we did and have gotten sacked * bytes then we can still send as per Limited Transmit (RFC3042) */ @@ -1051,17 +1054,20 @@ tcp_snd_space (tcp_connection_t * tc) if (tcp_in_recovery (tc)) { tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes + snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes - (tc->snd_una_max - tc->snd_congestion); if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; return tcp_round_snd_space (tc, snd_space); } - /* If in fast recovery, send 1 SMSS if wnd allows */ - if (tcp_in_fastrecovery (tc) - && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc)) + /* RFC 5681: When previously unsent data is available and the new value of + * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS + * bytes of previously unsent data. 
*/ + if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { + if (tcp_available_output_snd_space (tc) < tc->snd_mss) + return 0; tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; } @@ -1073,7 +1079,8 @@ u32 tcp_session_send_space (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return tcp_snd_space (tc); + return clib_min (tcp_snd_space (tc), + tc->snd_wnd - (tc->snd_nxt - tc->snd_una)); } i32 diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 097cc8cf950..9e4660b86e6 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -498,7 +498,9 @@ tcp_half_open_connection_get (u32 conn_index) void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); -void tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4); +void tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, + u8 is_ip4); +void tcp_send_reset (tcp_connection_t * tc); void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); @@ -582,15 +584,30 @@ tcp_loss_wnd (const tcp_connection_t * tc) } always_inline u32 -tcp_available_wnd (const tcp_connection_t * tc) +tcp_available_snd_wnd (const tcp_connection_t * tc) { return clib_min (tc->cwnd, tc->snd_wnd); } +always_inline u32 +tcp_available_output_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + int flight_size = (int) (tc->snd_nxt - tc->snd_una); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +/** + * Estimate of how many bytes we can still push into the network + */ always_inline u32 tcp_available_snd_space (const tcp_connection_t * tc) { - u32 available_wnd = tcp_available_wnd (tc); + u32 available_wnd = tcp_available_snd_wnd (tc); u32 flight_size = tcp_flight_size (tc); if (available_wnd <= flight_size) diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6f9e4c7a2ce..95f9ade1369 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1363,7 +1363,7 @@ always_inline int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { - int written; + int written, error = TCP_ERROR_ENQUEUED; ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); @@ -1381,12 +1381,12 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Update rcv_nxt */ if (PREDICT_TRUE (written == data_len)) { - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + tc->rcv_nxt += written; } /* If more data written than expected, account for out-of-order bytes. */ else if (written > data_len) { - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len; + tc->rcv_nxt += written; /* Send ACK confirming the update */ tc->flags |= TCP_CONN_SNDACK; @@ -1400,7 +1400,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, * not be enqueued. 
Inform peer */ tc->flags |= TCP_CONN_SNDACK; - return TCP_ERROR_PARTIALLY_ENQUEUED; + error = TCP_ERROR_PARTIALLY_ENQUEUED; } else { @@ -1415,7 +1415,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); } - return TCP_ERROR_ENQUEUED; + return error; } /** Enqueue out-of-order data */ @@ -1495,10 +1495,10 @@ tcp_can_delack (tcp_connection_t * tc) static int tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) { - u32 discard; + u32 discard, first = b->current_length; vlib_main_t *vm = vlib_get_main (); - /* Handle multi segment packets */ + /* Handle multi-buffer segments */ if (n_bytes_to_drop > b->current_length) { if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) @@ -1511,7 +1511,12 @@ tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) n_bytes_to_drop -= discard; } while (n_bytes_to_drop); + if (n_bytes_to_drop > first) + b->total_length_not_including_first_buffer -= n_bytes_to_drop - first; } + else + vlib_buffer_advance (b, n_bytes_to_drop); + vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop; return 0; } @@ -1908,7 +1913,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) { if (!tcp_rst (tcp0)) - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } @@ -1995,7 +2000,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * allocate session send reset */ if (stream_session_connect_notify (&new_tc0->connection, 0)) { - tcp_send_reset (new_tc0, b0, is_ip4); + tcp_send_reset_w_pkt (new_tc0, b0, is_ip4); tcp_connection_cleanup (new_tc0); goto drop; } @@ -2017,7 +2022,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (stream_session_connect_notify (&new_tc0->connection, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0); goto drop; } @@ -2221,7 +2226,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 7da0c07349d..c56eadf8fdd 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -440,7 +440,8 @@ tcp_init_mss (tcp_connection_t * tc) always_inline int tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) { - vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1); + vec_validate (tm->tx_buffers[thread_index], + vec_len (tm->tx_buffers[thread_index]) + n_free_buffers - 1); _vec_len (tm->tx_buffers[thread_index]) = vlib_buffer_alloc_from_free_list (vlib_get_main (), tm->tx_buffers[thread_index], @@ -480,27 +481,31 @@ tcp_return_buffer (tcp_main_t * tm) _vec_len (my_tx_buffers) += 1; } -always_inline void +always_inline void * tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { - vlib_buffer_t *it = b; - u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; - do - { - it->current_data = 0; - it->current_length = 0; - it->total_length_not_including_first_buffer = 0; - } - while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) - && (it = vlib_get_buffer (vm, it->next_buffer))); - if (b->flags & VLIB_BUFFER_NEXT_PRESENT) vlib_buffer_free_one (vm, b->next_buffer); - b->flags = save_free_list; + b->flags = 0; + b->current_data = 0; + b->current_length = 0; + 
b->total_length_not_including_first_buffer = 0; + vnet_buffer (b)->tcp.flags = 0; /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +always_inline void * +tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->total_length_not_including_first_buffer = 0; vnet_buffer (b)->tcp.flags = 0; + + /* Leave enough space for headers */ + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); } /** @@ -632,6 +637,59 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, vlib_put_frame_to_node (vm, next_index, f); } +always_inline void +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->error = 0; + + /* Decide where to send the packet */ + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } + to_next = vlib_frame_vector_args (f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + int tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, tcp_state_t state, u8 thread_index, u8 is_ip4) @@ -712,9 +770,11 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, /** * Send reset without reusing existing buffer + * + * It extracts connection info out of original packet */ void -tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) +tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) { vlib_buffer_t *b; u32 bi; @@ -730,9 +790,7 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) return; b = vlib_get_buffer (vm, bi); - - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); /* Make and write options */ tcp_hdr_len = sizeof (tcp_header_t); @@ -787,6 +845,38 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } +/** + * Build and set reset packet for connection + */ +void +tcp_send_reset (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_buffer_t *b; + u32 bi; + tcp_header_t *th; + u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len; + u8 flags; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + tcp_hdr_opts_len = 
tc->snd_opts_len + sizeof (tcp_header_t); + advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED); + flags = TCP_FLAG_RST; + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); + ASSERT (opts_write_len == tc->snd_opts_len); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); +} + void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { @@ -835,9 +925,7 @@ tcp_send_syn (tcp_connection_t * tc) return; b = vlib_get_buffer (vm, bi); - - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); /* Set random initial sequence */ time_now = tcp_time_now (); @@ -875,59 +963,6 @@ tcp_send_syn (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } -always_inline void -tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4, u8 flush) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - u32 thread_index = vlib_get_thread_index (); - u32 *to_next, next_index; - vlib_frame_t *f; - - b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; - b->error = 0; - - /* Decide where to send the packet */ - next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; - - /* Initialize the trajectory trace, if configured */ - if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) - { - b->pre_data[0] = 1; - b->pre_data[1] = next_index; - } - - /* Get frame to v4/6 output node */ - f = tm->tx_frames[!is_ip4][thread_index]; - if (!f) - { - f = vlib_get_frame_to_node (vm, next_index); - ASSERT (f); - tm->tx_frames[!is_ip4][thread_index] = f; - } - to_next = vlib_frame_vector_args (f); - to_next[f->n_vectors] = bi; - f->n_vectors += 1; - if (flush || f->n_vectors == VLIB_FRAME_SIZE) - { - vlib_put_frame_to_node (vm, next_index, f); - tm->tx_frames[!is_ip4][thread_index] = 0; - } -} - -always_inline void -tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) -{ - tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); -} - -always_inline void -tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4) -{ - tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); -} - /** * Flush tx frame populated by retransmits and timer pops */ @@ -969,7 +1004,7 @@ tcp_send_fin (tcp_connection_t * tc) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - + /* buffer will be initialized by in tcp_make_fin */ tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; @@ -1013,6 +1048,8 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t *th; data_len = b->current_length + b->total_length_not_including_first_buffer; + ASSERT (!b->total_length_not_including_first_buffer + || (b->flags & VLIB_BUFFER_NEXT_PRESENT)); vnet_buffer (b)->tcp.flags = 0; if (compute_opts) @@ -1106,29 +1143,27 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, * Make sure we can retransmit something */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + available_bytes -= offset; if (!available_bytes) return 0; max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); max_deq_bytes = clib_min (available_bytes, max_deq_bytes); - seg_size = max_deq_bytes + MAX_HDRS_LEN; /* Start is beyond snd_congestion */ start = tc->snd_una + offset; if (seq_geq (start, tc->snd_congestion)) - 
{ - goto done; - } + goto done; /* Don't overshoot snd_congestion */ if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) { max_deq_bytes = tc->snd_congestion - start; if (max_deq_bytes == 0) - { - goto done; - } + goto done; } + seg_size = max_deq_bytes + MAX_HDRS_LEN; + /* * Prepare options */ @@ -1141,7 +1176,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return 0; *b = vlib_get_buffer (vm, bi); - data = vlib_buffer_make_headroom (*b, MAX_HDRS_LEN); + data = tcp_init_buffer (vm, *b); /* Easy case, buffer size greater than mss */ if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer)) @@ -1162,7 +1197,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, int i; n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer); - ASSERT (available_bytes >= max_deq_bytes); /* Make sure we have enough buffers */ available_bufs = vec_len (tm->tx_buffers[thread_index]); @@ -1182,8 +1216,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, b[0]->current_length = n_bytes; b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; b[0]->total_length_not_including_first_buffer = 0; - - tcp_push_hdr_i (tc, *b, tc->state, 0); max_deq_bytes -= n_bytes; chain_b = *b; @@ -1197,22 +1229,22 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, chain_b->current_data = 0; data = vlib_buffer_get_current (chain_b); n_peeked = stream_session_peek_bytes (&tc->connection, data, - n_bytes, len_to_deq); - n_bytes += n_peeked; + offset + n_bytes, len_to_deq); ASSERT (n_peeked == len_to_deq); + n_bytes += n_peeked; chain_b->current_length = n_peeked; - b[0]->total_length_not_including_first_buffer += - chain_b->current_length; + chain_b->flags = 0; + chain_b->next_buffer = 0; /* update previous buffer */ prev_b->next_buffer = chain_bi; prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; - /* update current buffer */ - chain_b->next_buffer = 0; - max_deq_bytes -= n_peeked; + b[0]->total_length_not_including_first_buffer += n_peeked; } + + tcp_push_hdr_i (tc, *b, tc->state, 0); } ASSERT (n_bytes > 0); @@ -1348,7 +1380,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ @@ -1409,8 +1441,9 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0; + u32 bi, old_snd_nxt, max_snd_bytes, available_bytes, offset; int n_bytes = 0; + u8 *data; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1419,12 +1452,13 @@ tcp_timer_persist_handler (u32 index) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + offset = tc->snd_una_max - tc->snd_una; /* Problem already solved or worse */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) - || !available_bytes) + || !available_bytes || available_bytes <= offset) return; /* Increment RTO backoff */ @@ -1437,18 +1471,17 @@ tcp_timer_persist_handler (u32 index) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); + data = tcp_init_buffer (vm, b); - tcp_validate_txf_size 
(tc, tc->snd_una_max - tc->snd_una); + tcp_validate_txf_size (tc, offset); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer); - n_bytes = stream_session_peek_bytes (&tc->connection, - vlib_buffer_get_current (b), - tc->snd_una_max - tc->snd_una, - snd_bytes); - ASSERT (n_bytes != 0); + max_snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer - MAX_HDRS_LEN); + n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, + max_snd_bytes); b->current_length = n_bytes; - ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 - || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + ASSERT (n_bytes != 0 && (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, + TCP_TIMER_RETRANSMIT))); /* Allow updating of snd_una_max but don't update snd_nxt */ old_snd_nxt = tc->snd_nxt; @@ -1456,8 +1489,8 @@ tcp_timer_persist_handler (u32 index) tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - /* Re-enable persist timer */ - tcp_persist_timer_set (tc); + /* Just sent new data, enable retransmit */ + tcp_retransmit_timer_update (tc); } /** @@ -1490,7 +1523,7 @@ void tcp_fast_retransmit_sack (tcp_connection_t * tc) { vlib_main_t *vm = vlib_get_main (); - u32 n_written = 0, offset = 0, max_bytes; + u32 n_written = 0, offset, max_bytes; vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; @@ -1523,7 +1556,9 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) * unSACKed sequence number SHOULD be returned, and RescueRxt set to * RecoveryPoint. HighRxt MUST NOT be updated. */ - max_bytes = clib_min (tc->snd_mss, snd_space); + max_bytes = clib_min (tc->snd_mss, + tc->snd_congestion - tc->snd_una); + max_bytes = clib_min (max_bytes, snd_space); offset = tc->snd_congestion - tc->snd_una - max_bytes; sb->rescue_rxt = tc->snd_congestion; tc->snd_nxt = tc->snd_una + offset; @@ -1535,9 +1570,12 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) break; } - max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + max_bytes = clib_min (hole->end - sb->high_rxt, snd_space); + max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; + if (max_bytes == 0) + break; offset = sb->high_rxt - tc->snd_una; - tc->snd_nxt = tc->snd_una + offset; + tc->snd_nxt = sb->high_rxt; n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); /* Nothing left to retransmit */ @@ -1547,6 +1585,7 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) bi = vlib_get_buffer_index (vm, b); sb->high_rxt += n_written; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + ASSERT (n_written <= snd_space); snd_space -= n_written; } @@ -1835,6 +1874,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd)); if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) { -- cgit 1.2.3-korg
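
For readers outside the tree, here is a small standalone C sketch (not part of the patch) contrasting tcp_available_output_snd_space(), added in the tcp.h hunk above, with a flight-size estimate measured against snd_una_max, which is the distinction behind the commit-message bullet "don't use rxt bytes in snd space estimate used on tx path": measuring flight from snd_nxt means bytes currently being retransmitted no longer shrink the space reported on the tx path. The tcp_conn_t struct, the simplified available_snd_space() stand-in, and the sample numbers below are illustrative assumptions only, not VPP definitions.

/* Standalone sketch (not VPP code): contrasts the two send-space views.
 * tcp_conn_t, available_snd_space() and the sample values are simplified
 * stand-ins for illustration only. */
#include <stdio.h>
#include <stdint.h>

typedef struct
{
  uint32_t snd_una;     /* oldest unacknowledged sequence number */
  uint32_t snd_nxt;     /* next sequence number to send */
  uint32_t snd_una_max; /* highest sequence number sent so far */
  uint32_t snd_wnd;     /* peer receive window */
  uint32_t cwnd;        /* congestion window */
} tcp_conn_t;

static uint32_t
available_snd_wnd (const tcp_conn_t * tc)
{
  /* mirrors tcp_available_snd_wnd() from the tcp.h hunk: min (cwnd, snd_wnd) */
  return tc->cwnd < tc->snd_wnd ? tc->cwnd : tc->snd_wnd;
}

static uint32_t
available_output_snd_space (const tcp_conn_t * tc)
{
  /* mirrors tcp_available_output_snd_space(): flight measured from snd_nxt,
   * so data being retransmitted (snd_nxt rewound below snd_una_max) does not
   * reduce the estimate used on the tx path */
  uint32_t wnd = available_snd_wnd (tc);
  uint32_t flight = tc->snd_nxt - tc->snd_una;
  return wnd <= flight ? 0 : wnd - flight;
}

static uint32_t
available_snd_space (const tcp_conn_t * tc)
{
  /* simplified stand-in for the snd_una_max-based estimate: everything sent
   * so far, including bytes currently being retransmitted, counts as in
   * flight (the real tcp_flight_size() also accounts for SACKed and
   * retransmitted bytes, which is omitted here) */
  uint32_t wnd = available_snd_wnd (tc);
  uint32_t flight = tc->snd_una_max - tc->snd_una;
  return wnd <= flight ? 0 : wnd - flight;
}

int
main (void)
{
  /* During recovery snd_nxt is rewound below snd_una_max while old data is
   * retransmitted; the two estimates then diverge. */
  tcp_conn_t tc = { .snd_una = 1000, .snd_nxt = 3000, .snd_una_max = 5000,
                    .snd_wnd = 10000, .cwnd = 8000 };
  printf ("tx-path snd space (flight from snd_nxt): %u\n",
          (unsigned) available_output_snd_space (&tc));
  printf ("snd space (flight from snd_una_max):     %u\n",
          (unsigned) available_snd_space (&tc));
  return 0;
}

With these sample values (snd_nxt rewound to 3000 while snd_una_max sits at 5000), the tx-path estimate reports 6000 usable bytes while the snd_una_max-based view reports only 4000; that gap is what the last bullet of the commit message removes from the tx path.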