aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp/tcp_output.c
diff options
context:
space:
mode:
author: Florin Coras <fcoras@cisco.com> 2017-09-05 14:03:37 -0400
committer: Damjan Marion <dmarion.lists@gmail.com> 2017-09-12 11:41:10 +0000
commit: 4eeeaaf5e822718eb222e6c49abd82e1bcb566fd (patch)
tree: f8ceca24b5f954bc615f1ef2e9383652b035052b /src/vnet/tcp/tcp_output.c
parent: 2504ac699e423f1ca840a63247ce55cb27735e0a (diff)
tcp: horizontal scaling improvements
- do not scale syn-ack window - fix the max number of outstanding syns in builtin client - fix syn-sent ack validation to use modulo arithmetic - improve retransmit timer handler - fix output buffer allocator leakage - improved debugging Change-Id: Iac3bc0eadf7d0b494a93e22d210a3153b61b3273 Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp/tcp_output.c')
-rw-r--r-- src/vnet/tcp/tcp_output.c 212
1 files changed, 99 insertions, 113 deletions
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 15a9dcb48a8..9cb3e77937e 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -66,11 +66,10 @@ format_tcp_tx_trace (u8 * s, va_list * args)
}
static u8
-tcp_window_compute_scale (u32 available_space)
+tcp_window_compute_scale (u32 window)
{
u8 wnd_scale = 0;
- while (wnd_scale < TCP_MAX_WND_SCALE
- && (available_space >> wnd_scale) > TCP_WND_MAX)
+ while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
wnd_scale++;
return wnd_scale;
}
@@ -444,12 +443,10 @@ tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
vec_validate (tm->tx_buffers[thread_index],
current_length + n_free_buffers - 1);
- _vec_len (tm->tx_buffers[thread_index]) =
- current_length + vlib_buffer_alloc_from_free_list (vlib_get_main (),
- tm->tx_buffers
- [thread_index],
- n_free_buffers,
- VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ _vec_len (tm->tx_buffers[thread_index]) = current_length
+ + vlib_buffer_alloc (vlib_get_main (),
+ &tm->tx_buffers[thread_index][current_length],
+ n_free_buffers);
/* buffer shortage, report failure */
if (vec_len (tm->tx_buffers[thread_index]) == 0)
{
@@ -470,7 +467,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
return -1;
}
my_tx_buffers = tm->tx_buffers[thread_index];
- *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
+ *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1];
_vec_len (my_tx_buffers) -= 1;
return 0;
}
@@ -478,10 +475,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
always_inline void
tcp_return_buffer (tcp_main_t * tm)
{
- u32 *my_tx_buffers;
- u32 thread_index = vlib_get_thread_index ();
- my_tx_buffers = tm->tx_buffers[thread_index];
- _vec_len (my_tx_buffers) += 1;
+ _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1;
}
always_inline void *
@@ -489,7 +483,8 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
vlib_buffer_free_one (vm, b->next_buffer);
- b->flags = 0;
+ /* Zero all flags but free list index and trace flag */
+ b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
b->current_data = 0;
b->current_length = 0;
b->total_length_not_including_first_buffer = 0;
@@ -503,7 +498,8 @@ always_inline void *
tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
- b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
b->total_length_not_including_first_buffer = 0;
vnet_buffer (b)->tcp.flags = 0;
@@ -567,8 +563,34 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b)
/* Reset flags, make sure ack is sent */
vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
+}
+
+/**
+ * Convert buffer to SYN
+ */
+void
+tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ u8 tcp_hdr_opts_len, tcp_opts_len;
+ tcp_header_t *th;
+ u16 initial_wnd;
+ tcp_options_t snd_opts;
+
+ initial_wnd = tcp_initial_window_to_advertise (tc);
- tc->snd_nxt += 1;
+ /* Make and write options */
+ memset (&snd_opts, 0, sizeof (snd_opts));
+ tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
+ tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
+ tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
+ initial_wnd);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ tcp_options_write ((u8 *) (th + 1), &snd_opts);
+
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
+ tc->rto * TCP_TO_TIMER_TICK);
}
/**
@@ -582,37 +604,25 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
u8 tcp_opts_len, tcp_hdr_opts_len;
tcp_header_t *th;
u16 initial_wnd;
- u32 time_now;
memset (snd_opts, 0, sizeof (*snd_opts));
-
tcp_reuse_buffer (vm, b);
- /* Set random initial sequence */
- time_now = tcp_time_now ();
-
- tc->iss = random_u32 (&time_now);
- tc->snd_una = tc->iss;
- tc->snd_nxt = tc->iss + 1;
- tc->snd_una_max = tc->snd_nxt;
-
initial_wnd = tcp_initial_window_to_advertise (tc);
-
- /* Make and write options */
tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
tc->rcv_nxt, tcp_hdr_opts_len,
TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
-
tcp_options_write ((u8 *) (th + 1), snd_opts);
vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
- /* Init retransmit timer */
- tcp_retransmit_timer_set (tc);
+ /* Init retransmit timer. Use update instead of set because of
+ * retransmissions */
+ tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
}
@@ -918,44 +928,17 @@ tcp_send_syn (tcp_connection_t * tc)
u32 bi;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
- u8 tcp_hdr_opts_len, tcp_opts_len;
- tcp_header_t *th;
- u32 time_now;
- u16 initial_wnd;
- tcp_options_t snd_opts;
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
-
- /* Set random initial sequence */
- time_now = tcp_time_now ();
-
- tc->iss = random_u32 (&time_now);
- tc->snd_una = tc->iss;
- tc->snd_una_max = tc->snd_nxt = tc->iss + 1;
-
- initial_wnd = tcp_initial_window_to_advertise (tc);
-
- /* Make and write options */
- memset (&snd_opts, 0, sizeof (snd_opts));
- tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
- tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
-
- th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
- tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
- initial_wnd);
-
- tcp_options_write ((u8 *) (th + 1), &snd_opts);
+ tcp_make_syn (tc, b);
/* Measure RTT with this */
tc->rtt_ts = tcp_time_now ();
tc->rtt_seq = tc->snd_nxt;
-
- /* Start retransmit trimer */
- tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK);
tc->rto_boff = 0;
/* Set the connection establishment timer */
@@ -1010,8 +993,12 @@ tcp_send_fin (tcp_connection_t * tc)
/* buffer will be initialized by in tcp_make_fin */
tcp_make_fin (tc, b);
tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
- tc->flags |= TCP_CONN_FINSNT;
- tc->flags &= ~TCP_CONN_FINPNDG;
+ if (!(tc->flags & TCP_CONN_FINSNT))
+ {
+ tc->flags |= TCP_CONN_FINSNT;
+ tc->flags &= ~TCP_CONN_FINPNDG;
+ tc->snd_nxt += 1;
+ }
tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
}
@@ -1146,6 +1133,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
* Make sure we can retransmit something
*/
available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ ASSERT (available_bytes >= offset);
available_bytes -= offset;
if (!available_bytes)
return 0;
@@ -1209,6 +1197,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
VLIB_FRAME_SIZE - available_bufs))
{
tcp_return_buffer (tm);
+ *b = 0;
return 0;
}
}
@@ -1236,7 +1225,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
ASSERT (n_peeked == len_to_deq);
n_bytes += n_peeked;
chain_b->current_length = n_peeked;
- chain_b->flags = 0;
+ chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
chain_b->next_buffer = 0;
/* update previous buffer */
@@ -1310,19 +1299,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
}
- if (!tcp_in_recovery (tc) && tc->rto_boff > 0
- && tc->state >= TCP_STATE_ESTABLISHED)
- {
- tc->rto_boff = 0;
- tcp_update_rto (tc);
- }
-
- /* Increment RTO backoff (also equal to number of retries) */
- tc->rto_boff += 1;
-
- /* Go back to first un-acked byte */
- tc->snd_nxt = tc->snd_una;
-
if (tc->state >= TCP_STATE_ESTABLISHED)
{
/* Lost FIN, retransmit and return */
@@ -1332,6 +1308,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
return;
}
+ /* We're not in recovery so make sure rto_boff is 0 */
+ if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ {
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+ }
+
+ /* Increment RTO backoff (also equal to number of retries) and go back
+ * to first un-acked byte */
+ tc->rto_boff += 1;
+ tc->snd_nxt = tc->snd_una;
+
/* First retransmit timeout */
if (tc->rto_boff == 1)
tcp_rtx_timeout_cc (tc);
@@ -1349,12 +1337,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
if (n_bytes == 0)
{
- if (b)
- {
- clib_warning ("retransmit fail: %U", format_tcp_connection, tc,
- 2);
- ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
- }
+ ASSERT (!b);
+ if (tc->snd_una == tc->snd_una_max)
+ return;
+ ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
+ clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2);
/* Try again eventually */
tcp_retransmit_timer_set (tc);
return;
@@ -1365,16 +1352,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* For first retransmit, record timestamp (Eifel detection RFC3522) */
if (tc->rto_boff == 1)
tc->snd_rxt_ts = tcp_time_now ();
+
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ tcp_retransmit_timer_update (tc);
}
- /* Retransmit for SYN/SYNACK */
- else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT)
+ /* Retransmit for SYN */
+ else if (tc->state == TCP_STATE_SYN_SENT)
{
/* Half-open connection actually moved to established but we were
* waiting for syn retransmit to pop to call cleanup from the right
* thread. */
if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
{
- ASSERT (tc->state == TCP_STATE_SYN_SENT);
if (tcp_half_open_connection_cleanup (tc))
{
clib_warning ("could not remove half-open connection");
@@ -1385,49 +1374,46 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* Try without increasing RTO a number of times. If this fails,
* start growing RTO exponentially */
+ tc->rto_boff += 1;
if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- {
- clib_warning ("tcp_get_free_buffer_index FAIL");
- return;
- }
+ return;
+
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
- tcp_push_hdr_i (tc, b, tc->state, 1);
+ tcp_make_syn (tc, b);
- /* Account for the SYN */
- tc->snd_nxt += 1;
tc->rtt_ts = 0;
- TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc,
- (tc->state == TCP_STATE_SYN_SENT ? 0 : 1));
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0);
+
+ /* This goes straight to ipx_lookup. Retransmit timer set already */
+ tcp_push_ip_hdr (tm, tc, b);
+ tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
}
- else
+ /* Retransmit SYN-ACK */
+ else if (tc->state == TCP_STATE_SYN_RCVD)
{
- ASSERT (tc->state == TCP_STATE_CLOSED);
- clib_warning ("connection closed ...");
- return;
- }
+ tc->rto_boff += 1;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ tc->rtt_ts = 0;
- if (!is_syn)
- {
- tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
+ b = vlib_get_buffer (vm, bi);
+ tcp_make_synack (tc, b);
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
- /* Re-enable retransmit timer */
- tcp_retransmit_timer_set (tc);
+ /* Retransmit timer already updated, just enqueue to output */
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
}
else
{
- ASSERT (tc->state == TCP_STATE_SYN_SENT);
-
- /* This goes straight to ipx_lookup */
- tcp_push_ip_hdr (tm, tc, b);
- tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
-
- /* Re-enable retransmit timer */
- tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN,
- tc->rto * TCP_TO_TIMER_TICK);
+ ASSERT (tc->state == TCP_STATE_CLOSED);
+ clib_warning ("connection closed ...");
+ return;
}
}