summaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp/tcp_output.c
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2017-08-01 16:56:58 -0700
committerDave Barach <openvpp@barachs.net>2017-08-11 16:03:19 +0000
commitb2215d6b0d8ef7d425d2b9eea524a1c055a9f3b3 (patch)
tree6299677b83934af494e6bb7dd130ed8928304729 /src/vnet/tcp/tcp_output.c
parent755e41e4574103f5435ca45384c236bf11d8e28f (diff)
Fix tcp multi buffer segments retransmission
- Fix tcp/udp sw checksum computation - Fix allocation of multi buffer tcp segments for retransmits - Send FIN only if/when tx fifo is empty Change-Id: I2e43a14b87a72c9e547b4339b9a51811cf5732c4 Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp/tcp_output.c')
-rw-r--r--src/vnet/tcp/tcp_output.c290
1 files changed, 184 insertions, 106 deletions
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index f8fbb8a9e69..4c1add21c27 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -15,6 +15,7 @@
#include <vnet/tcp/tcp.h>
#include <vnet/lisp-cp/packets.h>
+#include <math.h>
vlib_node_registration_t tcp4_output_node;
vlib_node_registration_t tcp6_output_node;
@@ -84,7 +85,7 @@ void
tcp_update_rcv_mss (tcp_connection_t * tc)
{
/* TODO find our iface MTU */
- tc->mss = dummy_mtu;
+ tc->mss = dummy_mtu - sizeof (tcp_header_t);
}
/**
@@ -437,27 +438,34 @@ tcp_init_mss (tcp_connection_t * tc)
}
always_inline int
+tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
+{
+ vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1);
+ _vec_len (tm->tx_buffers[thread_index]) =
+ vlib_buffer_alloc_from_free_list (vlib_get_main (),
+ tm->tx_buffers[thread_index],
+ n_free_buffers,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ /* buffer shortage, report failure */
+ if (vec_len (tm->tx_buffers[thread_index]) == 0)
+ {
+ clib_warning ("out of buffers");
+ return -1;
+ }
+ return 0;
+}
+
+always_inline int
tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
{
- u32 *my_tx_buffers, n_free_buffers;
+ u32 *my_tx_buffers;
u32 thread_index = vlib_get_thread_index ();
- my_tx_buffers = tm->tx_buffers[thread_index];
- if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0))
+ if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0))
{
- n_free_buffers = VLIB_FRAME_SIZE;
- vec_validate (my_tx_buffers, n_free_buffers - 1);
- _vec_len (my_tx_buffers) =
- vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers,
- n_free_buffers,
- VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
- /* buffer shortage, report failure */
- if (vec_len (my_tx_buffers) == 0)
- {
- clib_warning ("out of buffers");
- return -1;
- }
- tm->tx_buffers[thread_index] = my_tx_buffers;
+ if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE))
+ return -1;
}
+ my_tx_buffers = tm->tx_buffers[thread_index];
*bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
_vec_len (my_tx_buffers) -= 1;
return 0;
@@ -476,6 +484,7 @@ always_inline void
tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
vlib_buffer_t *it = b;
+ u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK;
do
{
it->current_data = 0;
@@ -485,6 +494,10 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
while ((it->flags & VLIB_BUFFER_NEXT_PRESENT)
&& (it = vlib_get_buffer (vm, it->next_buffer)));
+ if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ vlib_buffer_free_one (vm, b->next_buffer);
+ b->flags = save_free_list;
+
/* Leave enough space for headers */
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
vnet_buffer (b)->tcp.flags = 0;
@@ -959,18 +972,16 @@ tcp_send_fin (tcp_connection_t * tc)
return;
b = vlib_get_buffer (vm, bi);
- /* Leave enough space for headers */
- vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
-
tcp_make_fin (tc, b);
tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
tc->flags |= TCP_CONN_FINSNT;
+ tc->flags &= ~TCP_CONN_FINPNDG;
tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
}
always_inline u8
-tcp_make_state_flags (tcp_state_t next_state)
+tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state)
{
switch (next_state)
{
@@ -982,7 +993,10 @@ tcp_make_state_flags (tcp_state_t next_state)
return TCP_FLAG_SYN;
case TCP_STATE_LAST_ACK:
case TCP_STATE_FIN_WAIT_1:
- return TCP_FLAG_FIN;
+ if (tc->snd_nxt + 1 < tc->snd_una_max)
+ return TCP_FLAG_ACK;
+ else
+ return TCP_FLAG_FIN;
default:
clib_warning ("Shouldn't be here!");
}
@@ -1008,7 +1022,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
advertise_wnd = tcp_window_to_advertise (tc, next_state);
- flags = tcp_make_state_flags (next_state);
+ flags = tcp_make_state_flags (tc, next_state);
/* Push header and options */
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
@@ -1055,7 +1069,11 @@ tcp_send_ack (tcp_connection_t * tc)
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
}
-/* Send delayed ACK when timer expires */
+/**
+ * Delayed ack timer handler
+ *
+ * Sends delayed ACK when timer expires
+ */
void
tcp_timer_delack_handler (u32 index)
{
@@ -1067,49 +1085,138 @@ tcp_timer_delack_handler (u32 index)
tcp_send_ack (tc);
}
-/** Build a retransmit segment
+/**
+ * Build a retransmit segment
*
* @return the number of bytes in the segment or 0 if there's nothing to
* retransmit
*/
u32
-tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
- u32 offset, u32 max_bytes)
+tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
+ u32 max_deq_bytes, vlib_buffer_t ** b)
{
+ tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
int n_bytes = 0;
- u32 start;
-
- tcp_reuse_buffer (vm, b);
+ u32 start, bi, available_bytes;
ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
- ASSERT (max_bytes != 0);
+ ASSERT (max_deq_bytes != 0);
- max_bytes = clib_min (tc->snd_mss, max_bytes);
+ /*
+ * Make sure we can retransmit something
+ */
+ max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ if (!available_bytes)
+ return 0;
+ max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
start = tc->snd_una + offset;
/* Start is beyond snd_congestion */
if (seq_geq (start, tc->snd_congestion))
- goto done;
+ {
+ goto done;
+ }
/* Don't overshoot snd_congestion */
- if (seq_gt (start + max_bytes, tc->snd_congestion))
+ if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
{
- max_bytes = tc->snd_congestion - start;
- if (max_bytes == 0)
- goto done;
+ max_deq_bytes = tc->snd_congestion - start;
+ if (max_deq_bytes == 0)
+ {
+ goto done;
+ }
}
+ /*
+ * Prepare options
+ */
tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
- ASSERT (max_bytes <= tc->snd_mss);
+ /*
+ * Allocate and fill in buffer(s)
+ */
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return 0;
+ *b = vlib_get_buffer (vm, bi);
+
+ /* Easy case, buffer size greater than mss */
+ if (PREDICT_TRUE (max_deq_bytes <= tm->bytes_per_buffer))
+ {
+ n_bytes = stream_session_peek_bytes (&tc->connection,
+ vlib_buffer_get_current (*b),
+ offset, max_deq_bytes);
+ ASSERT (n_bytes == max_deq_bytes);
+ b[0]->current_length = n_bytes;
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ }
+ /* Split mss into multiple buffers */
+ else
+ {
+ u32 chain_bi = ~0, n_bufs_per_seg;
+ u32 thread_index = vlib_get_thread_index ();
+ u16 n_peeked, len_to_deq, available_bufs;
+ vlib_buffer_t *chain_b, *prev_b;
+ u8 *data0;
+ int i;
+
+ n_bufs_per_seg = ceil ((double) max_deq_bytes / tm->bytes_per_buffer);
+ ASSERT (available_bytes >= max_deq_bytes);
+
+ /* Make sure we have enough buffers */
+ available_bufs = vec_len (tm->tx_buffers[thread_index]);
+ if (n_bufs_per_seg > available_bufs)
+ {
+ if (tcp_alloc_tx_buffers (tm, thread_index,
+ VLIB_FRAME_SIZE - available_bufs))
+ {
+ tcp_return_buffer (tm);
+ return 0;
+ }
+ }
+
+ n_bytes = stream_session_peek_bytes (&tc->connection,
+ vlib_buffer_get_current (*b),
+ offset, tm->bytes_per_buffer);
+ b[0]->current_length = n_bytes;
+ b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ b[0]->total_length_not_including_first_buffer = 0;
+
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ max_deq_bytes -= n_bytes;
+
+ chain_b = *b;
+ for (i = 1; i < n_bufs_per_seg; i++)
+ {
+ prev_b = chain_b;
+ len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer);
+ tcp_get_free_buffer_index (tm, &chain_bi);
+ ASSERT (chain_bi != (u32) ~ 0);
+ chain_b = vlib_get_buffer (vm, chain_bi);
+ chain_b->current_data = 0;
+ data0 = vlib_buffer_get_current (chain_b);
+ n_peeked = stream_session_peek_bytes (&tc->connection, data0,
+ n_bytes, len_to_deq);
+ n_bytes += n_peeked;
+ ASSERT (n_peeked == len_to_deq);
+ chain_b->current_length = n_peeked;
+ b[0]->total_length_not_including_first_buffer +=
+ chain_b->current_length;
+
+ /* update previous buffer */
+ prev_b->next_buffer = chain_bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ /* update current buffer */
+ chain_b->next_buffer = 0;
+
+ max_deq_bytes -= n_peeked;
+ }
+ }
- n_bytes = stream_session_peek_bytes (&tc->connection,
- vlib_buffer_get_current (b), offset,
- max_bytes);
ASSERT (n_bytes > 0);
- b->current_length = n_bytes;
- tcp_push_hdr_i (tc, b, tc->state, 0);
if (tcp_in_fastrecovery (tc))
tc->snd_rxt_bytes += n_bytes;
@@ -1147,7 +1254,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
vlib_main_t *vm = vlib_get_main ();
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
- vlib_buffer_t *b;
+ vlib_buffer_t *b = 0;
u32 bi, n_bytes;
if (is_syn)
@@ -1174,17 +1281,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* Go back to first un-acked byte */
tc->snd_nxt = tc->snd_una;
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
-
if (tc->state >= TCP_STATE_ESTABLISHED)
{
/* Lost FIN, retransmit and return */
- if (tc->flags & TCP_CONN_FINSNT)
+ if (tcp_is_lost_fin (tc))
{
- tcp_return_buffer (tm);
tcp_send_fin (tc);
return;
}
@@ -1199,7 +1300,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
/* Send one segment */
- n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+ ASSERT (n_bytes);
+ bi = vlib_get_buffer_index (vm, b);
/* TODO be less aggressive about this */
scoreboard_clear (&tc->sack_sb);
@@ -1212,7 +1315,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tcp_retransmit_timer_set (tc);
ASSERT (0 || (tc->rto_boff > 1
&& tc->snd_una == tc->snd_congestion));
- tcp_return_buffer (tm);
return;
}
@@ -1234,7 +1336,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
clib_warning ("could not remove half-open connection");
ASSERT (0);
}
- tcp_return_buffer (tm);
return;
}
@@ -1243,6 +1344,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
tcp_push_hdr_i (tc, b, tc->state, 1);
@@ -1256,7 +1360,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
{
ASSERT (tc->state == TCP_STATE_CLOSED);
clib_warning ("connection closed ...");
- tcp_return_buffer (tm);
return;
}
@@ -1305,7 +1408,7 @@ tcp_timer_persist_handler (u32 index)
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
vlib_buffer_t *b;
- u32 bi, old_snd_nxt;
+ u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0;
int n_bytes = 0;
tc = tcp_connection_get_if_valid (index, thread_index);
@@ -1317,34 +1420,31 @@ tcp_timer_persist_handler (u32 index)
tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
/* Problem already solved or worse */
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED
- || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+ || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)
+ || !available_bytes)
return;
/* Increment RTO backoff */
tc->rto_boff += 1;
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- /* Try to force the first unsent segment */
+ /*
+ * Try to force the first unsent segment (or buffer)
+ */
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
-
b = vlib_get_buffer (vm, bi);
tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+ snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer);
n_bytes = stream_session_peek_bytes (&tc->connection,
vlib_buffer_get_current (b),
tc->snd_una_max - tc->snd_una,
- tc->snd_mss);
- /* Nothing to send */
- if (n_bytes <= 0)
- {
- // clib_warning ("persist found nothing to send");
- tcp_return_buffer (tm);
- return;
- }
-
+ snd_bytes);
+ ASSERT (n_bytes != 0);
b->current_length = n_bytes;
ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1
|| tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT));
@@ -1365,32 +1465,20 @@ tcp_timer_persist_handler (u32 index)
void
tcp_retransmit_first_unacked (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
vlib_buffer_t *b;
- u32 bi, n_bytes, old_snd_nxt;
+ u32 bi, old_snd_nxt, n_bytes;
old_snd_nxt = tc->snd_nxt;
tc->snd_nxt = tc->snd_una;
- /* Get buffer */
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
-
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
-
- n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
- if (n_bytes == 0)
- {
- tcp_return_buffer (tm);
- goto done;
- }
-
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+ if (!n_bytes)
+ return;
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
-done:
tc->snd_nxt = old_snd_nxt;
}
@@ -1400,10 +1488,9 @@ done:
void
tcp_fast_retransmit_sack (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset = 0, max_bytes;
- vlib_buffer_t *b;
+ vlib_buffer_t *b = 0;
sack_scoreboard_hole_t *hole;
sack_scoreboard_t *sb;
u32 bi, old_snd_nxt;
@@ -1420,10 +1507,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
while (hole && snd_space > 0)
{
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
&can_rescue, &snd_limited);
@@ -1443,7 +1526,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
offset = tc->snd_congestion - tc->snd_una - max_bytes;
sb->rescue_rxt = tc->snd_congestion;
tc->snd_nxt = tc->snd_una + offset;
- tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes,
+ &b);
+ ASSERT (n_written);
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
break;
}
@@ -1451,15 +1537,13 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt;
offset = sb->high_rxt - tc->snd_una;
tc->snd_nxt = tc->snd_una + offset;
- n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b);
/* Nothing left to retransmit */
if (n_written == 0)
- {
- tcp_return_buffer (tm);
- break;
- }
+ break;
+ bi = vlib_get_buffer_index (vm, b);
sb->high_rxt += n_written;
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
snd_space -= n_written;
@@ -1475,7 +1559,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
void
tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset = 0, bi, old_snd_nxt;
int snd_space;
@@ -1491,19 +1574,14 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
while (snd_space > 0)
{
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
- b = vlib_get_buffer (vm, bi);
offset += n_written;
- n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
/* Nothing left to retransmit */
if (n_written == 0)
- {
- tcp_return_buffer (tm);
- break;
- }
+ break;
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
snd_space -= n_written;
}