aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
author: Dave Barach <dbarach@cisco.com> 2017-06-26 11:35:07 -0400
committer: Florin Coras <florin.coras@gmail.com> 2017-07-11 15:40:19 +0000
commit: 2c25a62cc1cc4937165de740a3b32d78429c72d6 (patch)
tree: b860025298501460e21cf8e5722c6155f87495ec /src/vnet/tcp
parent: 8af1b2fdecc883eadfec6b91434adc6044e24cb2 (diff)
Horizontal (nSessions) scaling draft
- Data structure preallocation.
- Input state machine fixes for mid-stream 3-way handshake retries.
- Batch connections in the builtin_client
- Multiple private fifo segment support
- Fix elog simultaneous event type registration
- Fix sacks when segment hole is added after highest sacked
- Add "accepting" session state for sessions pending accept
- Add ssvm non-recursive locking
- Estimate RTT for syn-ack
- Don't init fifo pointers. We're using relative offsets for ooo segments
- CLI to dump individual session

Change-Id: Ie0598563fd246537bafba4feed7985478ea1d415
Signed-off-by: Dave Barach <dbarach@cisco.com>
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- src/vnet/tcp/builtin_client.c 118
-rw-r--r-- src/vnet/tcp/builtin_client.h 7
-rw-r--r-- src/vnet/tcp/builtin_server.c 66
-rw-r--r-- src/vnet/tcp/tcp.c 225
-rw-r--r-- src/vnet/tcp/tcp.h 13
-rwxr-xr-x src/vnet/tcp/tcp_debug.h 13
-rw-r--r-- src/vnet/tcp/tcp_input.c 97
-rw-r--r-- src/vnet/tcp/tcp_newreno.c 4
-rw-r--r-- src/vnet/tcp/tcp_output.c 53
-rw-r--r-- src/vnet/tcp/tcp_packet.h 1
-rw-r--r-- src/vnet/tcp/tcp_test.c 10
11 files changed, 480 insertions, 127 deletions
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 6f8be082d95..a6c8a23582b 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -170,62 +170,90 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
{
tclient_main_t *tm = &tclient_main;
int my_thread_index = vlib_get_thread_index ();
- vl_api_disconnect_session_t *dmp;
session_t *sp;
int i;
int delete_session;
u32 *connection_indices;
- u32 tx_quota = 0;
- u32 delta, prev_bytes_received_this_session;
+ u32 *connections_this_batch;
+ u32 nconnections_this_batch;
connection_indices = tm->connection_index_by_thread[my_thread_index];
+ connections_this_batch =
+ tm->connections_this_batch_by_thread[my_thread_index];
- if (tm->run_test == 0 || vec_len (connection_indices) == 0)
+ if ((tm->run_test == 0) ||
+ ((vec_len (connection_indices) == 0)
+ && vec_len (connections_this_batch) == 0))
return 0;
- for (i = 0; i < vec_len (connection_indices); i++)
+ /* Grab another pile of connections */
+ if (PREDICT_FALSE (vec_len (connections_this_batch) == 0))
+ {
+ nconnections_this_batch =
+ clib_min (tm->connections_per_batch, vec_len (connection_indices));
+
+ ASSERT (nconnections_this_batch > 0);
+ vec_validate (connections_this_batch, nconnections_this_batch - 1);
+ clib_memcpy (connections_this_batch,
+ connection_indices + vec_len (connection_indices)
+ - nconnections_this_batch,
+ nconnections_this_batch * sizeof (u32));
+ _vec_len (connection_indices) -= nconnections_this_batch;
+ }
+
+ if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch
+ && tm->prev_conns == vec_len (connections_this_batch)))
+ {
+ tm->repeats++;
+ tm->prev_conns = vec_len (connections_this_batch);
+ if (tm->repeats == 500000)
+ {
+ clib_warning ("stuck clients");
+ }
+ }
+ else
+ {
+ tm->prev_conns = vec_len (connections_this_batch);
+ tm->repeats = 0;
+ }
+
+ for (i = 0; i < vec_len (connections_this_batch); i++)
{
delete_session = 1;
- sp = pool_elt_at_index (tm->sessions, connection_indices[i]);
+ sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]);
- if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0)
+ if (sp->bytes_to_send > 0)
{
send_test_chunk (tm, sp);
delete_session = 0;
- tx_quota++;
}
- if (!tm->no_return && sp->bytes_to_receive > 0)
+ if (sp->bytes_to_receive > 0)
{
- prev_bytes_received_this_session = sp->bytes_received;
receive_test_chunk (tm, sp);
- delta = sp->bytes_received - prev_bytes_received_this_session;
- if (delta > 0)
- tx_quota--;
delete_session = 0;
}
if (PREDICT_FALSE (delete_session == 1))
{
- __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send);
+ u32 index, thread_index;
+ stream_session_t *s;
+
+ __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent);
__sync_fetch_and_add (&tm->rx_total, sp->bytes_received);
- dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp));
- memset (dmp, 0, sizeof (*dmp));
- dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION);
- dmp->client_index = tm->my_client_index;
- dmp->handle = sp->vpp_session_handle;
- if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp,
- 1))
+ stream_session_parse_handle (sp->vpp_session_handle,
+ &index, &thread_index);
+ s = stream_session_get_if_valid (index, thread_index);
+
+ if (s)
{
- vec_delete (connection_indices, 1, i);
- tm->connection_index_by_thread[my_thread_index] =
- connection_indices;
+ stream_session_disconnect (s);
+ vec_delete (connections_this_batch, 1, i);
+ i--;
__sync_fetch_and_add (&tm->ready_connections, -1);
}
else
- {
- vl_msg_api_free (dmp);
- }
+ clib_warning ("session AWOL?");
/* Kick the debug CLI process */
if (tm->ready_connections == 0)
@@ -236,6 +264,10 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
}
}
}
+
+ tm->connection_index_by_thread[my_thread_index] = connection_indices;
+ tm->connections_this_batch_by_thread[my_thread_index] =
+ connections_this_batch;
return 0;
}
@@ -356,6 +388,8 @@ tcp_test_clients_init (vlib_main_t * vm)
tm->vlib_main = vm;
vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains);
+ vec_validate (tm->connections_this_batch_by_thread,
+ thread_main->n_vlib_mains);
return 0;
}
@@ -388,7 +422,8 @@ builtin_session_connected_callback (u32 app_index, u32 api_context,
pool_get (tm->sessions, session);
memset (session, 0, sizeof (*session));
session_index = session - tm->sessions;
- session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send;
+ session->bytes_to_send = tm->bytes_to_send;
+ session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send;
session->server_rx_fifo = s->server_rx_fifo;
session->server_rx_fifo->client_session_index = session_index;
session->server_tx_fifo = s->server_tx_fifo;
@@ -485,6 +520,8 @@ attach_builtin_test_clients_app (void)
options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32);
options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size;
options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size;
options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos;
options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
@@ -561,6 +598,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
tm->bytes_to_send = 8192;
tm->no_return = 0;
tm->fifo_size = 64 << 10;
+ tm->connections_per_batch = 1000;
+ tm->private_segment_count = 0;
+ tm->private_segment_size = 0;
vec_free (tm->connect_uri);
@@ -582,6 +622,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
tm->no_return = 1;
else if (unformat (input, "fifo-size %d", &tm->fifo_size))
tm->fifo_size <<= 10;
+ else if (unformat (input, "private-segment-count %d",
+ &tm->private_segment_count))
+ ;
+ else if (unformat (input, "private-segment-size %dm", &tmp))
+ tm->private_segment_size = tmp << 20;
+ else if (unformat (input, "private-segment-size %dg", &tmp))
+ tm->private_segment_size = tmp << 30;
+ else if (unformat (input, "private-segment-size %d", &tmp))
+ tm->private_segment_size = tmp;
+ else if (unformat (input, "preallocate-fifos"))
+ tm->prealloc_fifos = 1;
+ else
+ if (unformat (input, "client-batch %d", &tm->connections_per_batch))
+ ;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
@@ -688,9 +742,13 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
vlib_cli_output (vm, "zero delta-t?");
cleanup:
- pool_free (tm->sessions);
+ tm->run_test = 0;
for (i = 0; i < vec_len (tm->connection_index_by_thread); i++)
- vec_reset_length (tm->connection_index_by_thread[i]);
+ {
+ vec_reset_length (tm->connection_index_by_thread[i]);
+ vec_reset_length (tm->connections_this_batch_by_thread[i]);
+ }
+ pool_free (tm->sessions);
return 0;
}
diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h
index 3462e0eeee8..38af231dec3 100644
--- a/src/vnet/tcp/builtin_client.h
+++ b/src/vnet/tcp/builtin_client.h
@@ -63,6 +63,9 @@ typedef struct
u32 configured_segment_size;
u32 fifo_size;
u32 expected_connections; /**< Number of clients/connections */
+ u32 connections_per_batch; /**< Connections to rx/tx at once */
+ u32 private_segment_count; /**< Number of private fifo segs */
+ u32 private_segment_size; /**< size of private fifo segs */
/*
* Test state variables
@@ -72,6 +75,7 @@ typedef struct
uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */
u8 *connect_test_data; /**< Pre-computed test data */
u32 **connection_index_by_thread;
+ u32 **connections_this_batch_by_thread; /**< active connection batch */
pthread_t client_thread_handle;
volatile u32 ready_connections;
@@ -82,7 +86,8 @@ typedef struct
f64 test_start_time;
f64 test_end_time;
-
+ u32 prev_conns;
+ u32 repeats;
/*
* Flags
*/
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 775bfc26e47..8e958ac0b45 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -56,12 +56,15 @@ typedef struct
u32 fifo_size; /**< Fifo size */
u32 rcv_buffer_size; /**< Rcv buffer size */
u32 prealloc_fifos; /**< Preallocate fifos */
+ u32 private_segment_count; /**< Number of private segments */
+ u32 private_segment_size; /**< Size of private segments */
/*
* Test state
*/
u8 **rx_buf; /**< Per-thread RX buffer */
u64 byte_index;
+ u32 **rx_retries;
vlib_main_t *vlib_main;
} builtin_server_main_t;
@@ -77,6 +80,8 @@ builtin_session_accept_callback (stream_session_t * s)
session_manager_get_vpp_event_queue (s->thread_index);
s->session_state = SESSION_STATE_READY;
bsm->byte_index = 0;
+ vec_validate (bsm->rx_retries[s->thread_index], s->session_index);
+ bsm->rx_retries[s->thread_index][s->session_index] = 0;
return 0;
}
@@ -173,11 +178,16 @@ builtin_server_rx_callback (stream_session_t * s)
builtin_server_main_t *bsm = &builtin_server_main;
session_fifo_event_t evt;
static int serial_number = 0;
- u32 my_thread_id = vlib_get_thread_index ();
+ u32 thread_index = vlib_get_thread_index ();
+
+ ASSERT (s->thread_index == thread_index);
rx_fifo = s->server_rx_fifo;
tx_fifo = s->server_tx_fifo;
+ ASSERT (rx_fifo->master_thread_index == thread_index);
+ ASSERT (tx_fifo->master_thread_index == thread_index);
+
max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo);
@@ -201,21 +211,31 @@ builtin_server_rx_callback (stream_session_t * s)
evt.event_type = FIFO_EVENT_BUILTIN_RX;
evt.event_id = 0;
- q = bsm->vpp_queue[s->thread_index];
+ q = bsm->vpp_queue[thread_index];
if (PREDICT_FALSE (q->cursize == q->maxsize))
clib_warning ("out of event queue space");
- else
- unix_shared_memory_queue_add (q, (u8 *) & evt,
- 0 /* don't wait for mutex */ );
+ else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */
+ ))
+ clib_warning ("failed to enqueue self-tap");
+
+ bsm->rx_retries[thread_index][s->session_index]++;
+ if (bsm->rx_retries[thread_index][s->session_index] == 500000)
+ {
+ clib_warning ("session stuck: %U", format_stream_session, s, 2);
+ }
+ }
+ else
+ {
+ bsm->rx_retries[thread_index][s->session_index] = 0;
}
return 0;
}
- _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer;
+ _vec_len (bsm->rx_buf[thread_index]) = max_transfer;
actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer,
- bsm->rx_buf[my_thread_id]);
+ bsm->rx_buf[thread_index]);
ASSERT (actual_transfer == max_transfer);
// test_bytes (bsm, actual_transfer);
@@ -225,7 +245,7 @@ builtin_server_rx_callback (stream_session_t * s)
*/
n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer,
- bsm->rx_buf[my_thread_id]);
+ bsm->rx_buf[thread_index]);
if (n_written != max_transfer)
clib_warning ("short trout!");
@@ -237,11 +257,13 @@ builtin_server_rx_callback (stream_session_t * s)
evt.event_type = FIFO_EVENT_APP_TX;
evt.event_id = serial_number++;
- unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index],
- (u8 *) & evt, 0 /* do wait for mutex */ );
+ if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index],
+ (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("failed to enqueue tx evt");
}
- if (PREDICT_FALSE (max_enqueue < max_dequeue))
+ if (PREDICT_FALSE (n_written < max_dequeue))
goto rx_event;
return 0;
@@ -328,9 +350,13 @@ server_attach ()
a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size;
a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size;
- a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size;
a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
bsm->prealloc_fifos ? bsm->prealloc_fifos : 1;
+
+ a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
a->segment_name = segment_name;
a->segment_name_length = ARRAY_LEN (segment_name);
@@ -374,6 +400,8 @@ server_create (vlib_main_t * vm)
num_threads = 1 /* main thread */ + vtm->n_threads;
vec_validate (builtin_server_main.vpp_queue, num_threads - 1);
vec_validate (bsm->rx_buf, num_threads - 1);
+ vec_validate (bsm->rx_retries, num_threads - 1);
+
for (i = 0; i < num_threads; i++)
vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size);
@@ -435,11 +463,14 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
{
builtin_server_main_t *bsm = &builtin_server_main;
int rv;
+ u32 tmp;
bsm->no_echo = 0;
bsm->fifo_size = 64 << 10;
bsm->rcv_buffer_size = 128 << 10;
bsm->prealloc_fifos = 0;
+ bsm->private_segment_count = 0;
+ bsm->private_segment_size = 0;
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
{
@@ -449,8 +480,17 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
bsm->fifo_size <<= 10;
else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size))
;
- else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos))
+ else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos))
+ ;
+ else if (unformat (input, "private-segment-count %d",
+ &bsm->private_segment_count))
;
+ else if (unformat (input, "private-segment-size %dm", &tmp))
+ bsm->private_segment_size = tmp << 20;
+ else if (unformat (input, "private-segment-size %dg", &tmp))
+ bsm->private_segment_size = tmp << 30;
+ else if (unformat (input, "private-segment-size %d", &tmp))
+ bsm->private_segment_size = tmp;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 4e85eb3fc93..f379e699839 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -74,8 +74,16 @@ static void
tcp_connection_unbind (u32 listener_index)
{
tcp_main_t *tm = vnet_get_tcp_main ();
- TCP_EVT_DBG (TCP_EVT_UNBIND,
- pool_elt_at_index (tm->listener_pool, listener_index));
+ tcp_connection_t *tc;
+
+ tc = pool_elt_at_index (tm->listener_pool, listener_index);
+
+ TCP_EVT_DBG (TCP_EVT_UNBIND, tc);
+
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (tc, 0xFA, sizeof (*tc));
+
pool_put_index (tm->listener_pool, listener_index);
}
@@ -124,9 +132,20 @@ tcp_connection_cleanup (tcp_connection_t * tc)
/* Check if half-open */
if (tc->state == TCP_STATE_SYN_SENT)
- pool_put (tm->half_open_connections, tc);
+ {
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (tc, 0xFA, sizeof (*tc));
+ pool_put (tm->half_open_connections, tc);
+ }
else
- pool_put (tm->connections[tc->c_thread_index], tc);
+ {
+ int thread_index = tc->c_thread_index;
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (tc, 0xFA, sizeof (*tc));
+ pool_put (tm->connections[thread_index], tc);
+ }
}
/**
@@ -168,13 +187,14 @@ tcp_connection_reset (tcp_connection_t * tc)
/* Make sure all timers are cleared */
tcp_connection_timers_reset (tc);
-
stream_session_reset_notify (&tc->connection);
+
+ /* Wait for cleanup from session layer but not forever */
+ tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
break;
case TCP_STATE_CLOSED:
return;
}
-
}
/**
@@ -278,6 +298,9 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip)
tries = max - min;
time_now = tcp_time_now ();
+ /* Only support active opens from thread 0 */
+ ASSERT (vlib_get_thread_index () == 0);
+
/* Start at random point or max */
pool_get (tm->local_endpoints, tep);
clib_memcpy (&tep->ip, ip, sizeof (*ip));
@@ -343,6 +366,7 @@ tcp_connection_timers_reset (tcp_connection_t * tc)
}
}
+#if 0
typedef struct ip4_tcp_hdr
{
ip4_header_t ip;
@@ -435,6 +459,7 @@ tcp_connection_fib_attach (tcp_connection_t * tc)
tcp_connection_stack_on_fib_entry (tc);
}
+#endif /* 0 */
/** Initialize tcp connection variables
*
@@ -447,7 +472,7 @@ tcp_connection_init_vars (tcp_connection_t * tc)
tcp_init_mss (tc);
scoreboard_init (&tc->sack_sb);
tcp_cc_init (tc);
- tcp_connection_fib_attach (tc);
+ // tcp_connection_fib_attach (tc);
}
int
@@ -485,14 +510,38 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4)
if (is_ip4)
{
ip4_address_t *ip4;
- ip4 = ip_interface_get_first_ip (sw_if_index, 1);
- lcl_addr.ip4.as_u32 = ip4->as_u32;
+ int index;
+ if (vec_len (tm->ip4_src_addresses))
+ {
+ index = tm->last_v4_address_rotor++;
+ if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses))
+ tm->last_v4_address_rotor = 0;
+ lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32;
+ }
+ else
+ {
+ ip4 = ip_interface_get_first_ip (sw_if_index, 1);
+ lcl_addr.ip4.as_u32 = ip4->as_u32;
+ }
}
else
{
ip6_address_t *ip6;
- ip6 = ip_interface_get_first_ip (sw_if_index, 0);
- clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6));
+ int index;
+
+ if (vec_len (tm->ip6_src_addresses))
+ {
+ index = tm->last_v6_address_rotor++;
+ if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses))
+ tm->last_v6_address_rotor = 0;
+ clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index],
+ sizeof (*ip6));
+ }
+ else
+ {
+ ip6 = ip_interface_get_first_ip (sw_if_index, 0);
+ clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6));
+ }
}
/* Allocate source port */
@@ -614,7 +663,7 @@ u8 *
format_tcp_vars (u8 * s, va_list * args)
{
tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n",
+ s = format (s, " snd_una %u snd_nxt %u snd_una_max %u",
tc->snd_una - tc->iss, tc->snd_nxt - tc->iss,
tc->snd_una_max - tc->iss);
s = format (s, " rcv_nxt %u rcv_las %u\n",
@@ -628,12 +677,17 @@ format_tcp_vars (u8 * s, va_list * args)
s = format (s, " cong %U ", format_tcp_congestion_status, tc);
s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
- s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n",
+ s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u",
tc->prev_ssthresh, tc->snd_congestion - tc->iss,
tc->rcv_dupacks);
+ s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss);
+ s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr,
+ tc->tsecr_last_ack);
s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto,
tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
s = format (s, "rtt_seq %u\n", tc->rtt_seq);
+ s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
+ tcp_time_now () - tc->tsval_recent_age);
s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb);
if (vec_len (tc->snd_sacks))
s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
@@ -719,11 +773,21 @@ format_tcp_sacks (u8 * s, va_list * args)
tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
sack_block_t *sacks = tc->snd_sacks;
sack_block_t *block;
- vec_foreach (block, sacks)
- {
- s = format (s, " start %u end %u\n", block->start - tc->irs,
- block->end - tc->irs);
- }
+ int i, len = 0;
+
+ len = vec_len (sacks);
+ for (i = 0; i < len - 1; i++)
+ {
+ block = &sacks[i];
+ s = format (s, " start %u end %u\n", block->start - tc->irs,
+ block->end - tc->irs);
+ }
+ if (len)
+ {
+ block = &sacks[len - 1];
+ s = format (s, " start %u end %u", block->start - tc->irs,
+ block->end - tc->irs);
+ }
return s;
}
@@ -796,14 +860,18 @@ tcp_session_send_mss (transport_connection_t * trans_conn)
always_inline u32
tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
{
- if (tc->snd_wnd < tc->snd_mss)
+ if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
{
return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0;
}
/* If we can't write at least a segment, don't try at all */
- if (snd_space < tc->snd_mss)
- return 0;
+ if (PREDICT_FALSE (snd_space < tc->snd_mss))
+ {
+ if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX)
+ return snd_space;
+ return 0;
+ }
/* round down to mss multiple */
return snd_space - (snd_space % tc->snd_mss);
@@ -1042,6 +1110,8 @@ tcp_main_enable (vlib_main_t * vm)
vlib_thread_main_t *vtm = vlib_get_thread_main ();
clib_error_t *error = 0;
u32 num_threads;
+ int thread, i;
+ tcp_connection_t *tc __attribute__ ((unused));
if ((error = vlib_call_init_function (vm, ip_main_init)))
return error;
@@ -1074,6 +1144,27 @@ tcp_main_enable (vlib_main_t * vm)
num_threads = 1 /* main thread */ + vtm->n_threads;
vec_validate (tm->connections, num_threads - 1);
+ /*
+ * Preallocate connections
+ */
+ for (thread = 0; thread < num_threads; thread++)
+ {
+ for (i = 0; i < tm->preallocated_connections; i++)
+ pool_get (tm->connections[thread], tc);
+
+ for (i = 0; i < tm->preallocated_connections; i++)
+ pool_put_index (tm->connections[thread], i);
+ }
+
+ /*
+ * Preallocate half-open connections
+ */
+ for (i = 0; i < tm->preallocated_half_open_connections; i++)
+ pool_get (tm->half_open_connections, tc);
+
+ for (i = 0; i < tm->preallocated_half_open_connections; i++)
+ pool_put_index (tm->half_open_connections, i);
+
/* Initialize per worker thread tx buffers (used for control messages) */
vec_validate (tm->tx_buffers, num_threads - 1);
@@ -1116,7 +1207,6 @@ tcp_init (vlib_main_t * vm)
{
tcp_main_t *tm = vnet_get_tcp_main ();
- tm->vlib_main = vm;
tm->vnet_main = vnet_get_main ();
tm->is_enabled = 0;
@@ -1125,6 +1215,97 @@ tcp_init (vlib_main_t * vm)
VLIB_INIT_FUNCTION (tcp_init);
+
+/* "tcp" startup-config parser: stores the optional preallocated-connections
+ * and preallocated-half-open-connections counts; other input is an error. */
+static clib_error_t *
+tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
+{
+  tcp_main_t *tcp = vnet_get_tcp_main ();
+  for (;;)
+    {
+      if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT)
+        return 0;
+      if (unformat (input, "preallocated-connections %d",
+                    &tcp->preallocated_connections))
+        continue;
+      if (unformat (input, "preallocated-half-open-connections %d",
+                    &tcp->preallocated_half_open_connections))
+        continue;
+      return clib_error_return (0, "unknown input `%U'",
+                                format_unformat_error, input);
+    }
+}
+
+VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp");
+
+/** CLI handler for "tcp src-address": add <ip4> or <ip4> - <ip4> (inclusive)
+ *  to tm->ip4_src_addresses. v6 input is parsed but only triggers a warning.
+ *  @return 0 on success, or a clib error if no address was supplied. */
+static clib_error_t *
+tcp_src_address (vlib_main_t * vm,
+                 unformat_input_t * input, vlib_cli_command_t * cmd_arg)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  ip4_address_t v4start, v4end;
+  ip6_address_t v6start, v6end;
+  int v4set = 0, v6set = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "%U - %U", unformat_ip4_address, &v4start,
+                    unformat_ip4_address, &v4end))
+        v4set = 1;
+      else if (unformat (input, "%U", unformat_ip4_address, &v4start))
+        {
+          v4end = v4start;      /* single address: one-element range */
+          v4set = 1;
+        }
+      else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start,
+                         unformat_ip6_address, &v6end))
+        v6set = 1;
+      else if (unformat (input, "%U", unformat_ip6_address, &v6start))
+        {
+          v6end = v6start;      /* copy the full 16-byte address */
+          v6set = 1;
+        }
+      else
+        break;
+    }
+
+  if (!v4set && !v6set)
+    return clib_error_return (0, "at least one v4 or v6 address required");
+
+  if (v4set)
+    {
+      /* Walk the range in host byte order. Testing before the increment
+       * means a range ending at 255.255.255.255 cannot wrap and loop. */
+      u32 cur = clib_net_to_host_u32 (v4start.as_u32);
+      u32 last = clib_net_to_host_u32 (v4end.as_u32);
+      for (;; cur++)
+        {
+          v4start.as_u32 = clib_host_to_net_u32 (cur);
+          vec_add1 (tm->ip4_src_addresses, v4start);
+          if (cur >= last)      /* also covers start > end: add start only */
+            break;
+        }
+    }
+  if (v6set)
+    clib_warning ("v6 src address list unimplemented...");
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tcp_src_address_command, static) =
+{
+ .path = "tcp src-address",
+ .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range",
+ .function = tcp_src_address,
+};
+/* *INDENT-ON* */
+
+
+
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 12d804b82f6..37b10fd4753 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -348,6 +348,16 @@ typedef struct _tcp_main
/* Flag that indicates if stack is on or off */
u8 is_enabled;
+ /** Number of preallocated connections */
+ u32 preallocated_connections;
+ u32 preallocated_half_open_connections;
+
+ /** Vectors of src addresses. Optional unless one needs > 63K active-opens */
+ ip4_address_t *ip4_src_addresses;
+ u32 last_v4_address_rotor;
+ u32 last_v6_address_rotor;
+ ip6_address_t *ip6_src_addresses;
+
/* convenience */
vlib_main_t *vlib_main;
vnet_main_t *vnet_main;
@@ -569,6 +579,7 @@ tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b)
always_inline void
tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
tc->timers[timer_id]
= tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
tc->c_c_index, timer_id, interval);
@@ -577,6 +588,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
always_inline void
tcp_timer_reset (tcp_connection_t * tc, u8 timer_id)
{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID)
return;
@@ -588,6 +600,7 @@ tcp_timer_reset (tcp_connection_t * tc, u8 timer_id)
always_inline void
tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval)
{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID)
tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
tc->timers[timer_id]);
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index ae68ad1b264..be51bca2a26 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -383,9 +383,16 @@ typedef enum _tcp_dbg_evt
"establish", \
}, \
}; \
- DECLARE_ETD(_tc, _e, 2); \
- ed->data[0] = _timer_id; \
- ed->data[1] = _timer_id; \
+ if (_tc) \
+ { \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _timer_id; \
+ ed->data[1] = _timer_id; \
+ } \
+ else \
+ { \
+ clib_warning ("pop for unexisting connection %d", _tc_index); \
+ } \
}
#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index a2e6dad1298..45db0da69c6 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -251,6 +251,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
&& seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end))
{
+ ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
tc->tsval_recent = tc->rcv_opts.tsval;
tc->tsval_recent_age = tcp_time_now ();
}
@@ -383,12 +384,9 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
if (tc->srtt != 0)
{
err = mrtt - tc->srtt;
-// tc->srtt += err >> 3;
/* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
* The increase should be bound */
-// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2;
-
tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
@@ -491,6 +489,14 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
&& (prev_snd_wnd == tc->snd_wnd));
}
+/* Returns 1 when a FIN has been sent (TCP_CONN_FINSNT) and exactly one
+ * sequence number separates snd_una from snd_una_max; otherwise 0. */
+static u8
+tcp_is_lost_fin (tcp_connection_t * tc)
+{
+  return (tc->flags & TCP_CONN_FINSNT) && (tc->snd_una_max - tc->snd_una == 1);
+}
+
/**
* Checks if ack is a congestion control event.
*/
@@ -503,7 +509,7 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
*is_dack = tc->sack_sb.last_sacked_bytes
|| tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
- return (*is_dack || tcp_in_cong_recovery (tc));
+ return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc));
}
void
@@ -750,10 +756,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
* last hole end */
tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
last_hole = scoreboard_last_hole (sb);
- if (seq_gt (tc->snd_una_max, sb->high_sacked)
- && seq_gt (tc->snd_una_max, last_hole->end))
- last_hole->end = tc->snd_una_max;
- /* keep track of max byte sacked for when the last hole
+ if (seq_gt (tc->snd_una_max, last_hole->end))
+ {
+ if (seq_geq (last_hole->start, sb->high_sacked))
+ {
+ last_hole->end = tc->snd_una_max;
+ }
+ /* New hole after high sacked block */
+ else if (seq_lt (sb->high_sacked, tc->snd_una_max))
+ {
+ scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
+ tc->snd_una_max);
+ }
+ }
+ /* Keep track of max byte sacked for when the last hole
* is acked */
if (seq_gt (tmp.end, sb->high_sacked))
sb->high_sacked = tmp.end;
@@ -764,7 +780,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
while (hole && blk_index < vec_len (tc->rcv_opts.sacks))
{
blk = &tc->rcv_opts.sacks[blk_index];
-
if (seq_leq (blk->start, hole->start))
{
/* Block covers hole. Remove hole */
@@ -784,6 +799,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
}
else if (!next_hole)
{
+ ASSERT (seq_geq (sb->high_sacked, ack));
sb->snd_una_adv = sb->high_sacked - ack;
sb->last_bytes_delivered += sb->high_sacked - hole->end;
}
@@ -819,7 +835,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
hole->end = blk->start;
}
-
hole = scoreboard_next_hole (sb, hole);
}
}
@@ -827,10 +842,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
scoreboard_update_bytes (tc, sb);
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
+ ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes);
ASSERT (sb->sacked_bytes == 0
|| sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
- seq_max (tc->snd_una, ack));
+ ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
+ || sb->holes[sb->head].start == ack + sb->snd_una_adv);
}
/**
@@ -916,7 +934,8 @@ tcp_cc_congestion_undo (tcp_connection_t * tc)
static u8
tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
{
- return (tc->snd_rxt_ts
+ return (tcp_in_recovery (tc)
+ && tc->snd_rxt_ts
&& tcp_opts_tstamp (&tc->rcv_opts)
&& timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
}
@@ -994,6 +1013,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
ASSERT (tc->snd_una != tc->snd_una_max
|| tc->sack_sb.last_sacked_bytes);
+
tc->rcv_dupacks++;
if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
@@ -1012,17 +1032,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
goto partial_ack_test;
}
- /* If of of the two conditions lower hold, reset dupacks
- * 1) Cumulative ack does not cover more than congestion threshold,
- * and the following doesn't hold: the congestion window is
- * greater than SMSS bytes and the difference between highest_ack
- * and prev_highest_ack is at most 4*SMSS bytes (XXX)
- * 2) RFC6582 heuristic to avoid multiple fast retransmits
+ /* If either of the two conditions below holds, reset dupacks because
+ * we're probably after timeout (RFC6582 heuristics).
+ * If cumulative ack does not cover more than congestion threshold,
+ * and:
+ * 1) The following doesn't hold: The congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes
+ * 2) Echoed timestamp in the last non-dup ack does not equal the
+ * stored timestamp
*/
- if ((seq_gt (tc->snd_una, tc->snd_congestion)
- || !(tc->cwnd > tc->snd_mss
- && tc->bytes_acked <= 4 * tc->snd_mss))
- || tc->rcv_opts.tsecr != tc->tsecr_last_ack)
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
{
tc->rcv_dupacks = 0;
return;
@@ -1038,6 +1061,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
* three segments that have left the network and should've been
* buffered at the receiver XXX */
tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
+ ASSERT (tc->cwnd >= tc->snd_mss);
/* If cwnd allows, send more data */
if (tcp_opts_sack_permitted (&tc->rcv_opts)
@@ -1112,7 +1136,7 @@ partial_ack:
>= tc->sack_sb.last_bytes_delivered);
rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
- tc->sack_sb.last_bytes_delivered;
- if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
+ if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
{
/* If we have sacks and we haven't gotten an ack beyond high_rxt,
* remove sacked bytes delivered */
@@ -1301,6 +1325,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
{
int written;
+ ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
/* Pure ACK. Update rcv_nxt and be done. */
if (PREDICT_FALSE (data_len == 0))
{
@@ -1450,6 +1476,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
/* Chop off the bytes in the past */
n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
n_data_bytes -= n_bytes_to_drop;
+ vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
vlib_buffer_advance (b, n_bytes_to_drop);
goto in_order;
@@ -1912,11 +1939,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
goto drop;
}
- stream_session_init_fifos_pointers (&new_tc0->connection,
- new_tc0->irs + 1,
- new_tc0->iss + 1);
/* Make sure after data segment processing ACK is sent */
new_tc0->flags |= TCP_CONN_SNDACK;
+
+ /* Update rtt with the syn-ack sample */
+ new_tc0->bytes_acked = 1;
+ tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
}
/* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
else
@@ -1932,9 +1960,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
goto drop;
}
- stream_session_init_fifos_pointers (&new_tc0->connection,
- new_tc0->irs + 1,
- new_tc0->iss + 1);
+ tc0->rtt_ts = 0;
+
tcp_make_synack (new_tc0, b0);
next0 = tcp_next_output (is_ip4);
@@ -2151,8 +2178,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
<< tc0->rcv_opts.wscale;
tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
-
- /* Shoulder tap the server */
stream_session_accept_notify (&tc0->connection);
/* Reset SYN-ACK retransmit timer */
@@ -2175,6 +2200,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* If FIN is ACKed */
if (tc0->snd_una == tc0->snd_una_max)
{
+ ASSERT (tcp_fin (tcp0));
tc0->state = TCP_STATE_FIN_WAIT_2;
/* Stop all timers, 2MSL will be set lower */
tcp_connection_timers_reset (tc0);
@@ -2545,10 +2571,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp_make_synack (child0, b0);
next0 = tcp_next_output (is_ip4);
- /* Init fifo pointers after we have iss */
- stream_session_init_fifos_pointers (&child0->connection,
- child0->irs + 1,
- child0->iss + 1);
drop:
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
@@ -2886,9 +2908,12 @@ do { \
_(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
_(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
_(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
+ _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+ TCP_ERROR_NONE);
/* ACK for for a SYN-ACK -> tcp-rcv-process. */
_(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
/* SYN-ACK for a SYN */
_(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
@@ -2905,12 +2930,14 @@ do { \
_(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
_(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
/* ACK or FIN-ACK to our FIN */
_(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
/* FIN in reply to our FIN from the other side */
_(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
/* FIN confirming that the peer (app) has closed */
_(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
@@ -2929,6 +2956,8 @@ do { \
TCP_ERROR_NONE);
_(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
+ _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+ TCP_ERROR_CONNECTION_CLOSED);
#undef _
}
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index c825e952c9b..103fea4c194 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -63,8 +63,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type)
* window deflation" attempts to ensure that, when fast recovery
* eventually ends, approximately ssthresh amount of data will be
* outstanding in the network.*/
- tc->cwnd = (tc->cwnd > tc->bytes_acked) ?
- tc->cwnd - tc->bytes_acked : 0;
+ tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ?
+ tc->cwnd - tc->bytes_acked : tc->snd_mss;
if (tc->bytes_acked > tc->snd_mss)
tc->cwnd += tc->snd_mss;
}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 41bebcb34af..b418e8baa54 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -19,17 +19,20 @@
vlib_node_registration_t tcp4_output_node;
vlib_node_registration_t tcp6_output_node;
-typedef enum _tcp_output_nect
+typedef enum _tcp_output_next
{
TCP_OUTPUT_NEXT_DROP,
+ TCP_OUTPUT_NEXT_IP_LOOKUP,
TCP_OUTPUT_N_NEXT
} tcp_output_next_t;
#define foreach_tcp4_output_next \
_ (DROP, "error-drop") \
+ _ (IP_LOOKUP, "ip4-lookup")
#define foreach_tcp6_output_next \
_ (DROP, "error-drop") \
+ _ (IP_LOOKUP, "ip6-lookup")
static char *tcp_error_strings[] = {
#define tcp_error(n,s) s,
@@ -427,16 +430,16 @@ tcp_init_mss (tcp_connection_t * tc)
#define tcp_get_free_buffer_index(tm, bidx) \
do { \
u32 *my_tx_buffers, n_free_buffers; \
- u32 thread_index = vlib_get_thread_index(); \
- my_tx_buffers = tm->tx_buffers[thread_index]; \
+ u32 thread_index = vlib_get_thread_index(); \
+ my_tx_buffers = tm->tx_buffers[thread_index]; \
if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \
{ \
n_free_buffers = 32; /* TODO config or macro */ \
vec_validate (my_tx_buffers, n_free_buffers - 1); \
_vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \
- tm->vlib_main, my_tx_buffers, n_free_buffers, \
+ vlib_get_main(), my_tx_buffers, n_free_buffers, \
VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \
- tm->tx_buffers[thread_index] = my_tx_buffers; \
+ tm->tx_buffers[thread_index] = my_tx_buffers; \
} \
/* buffer shortage */ \
if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \
@@ -445,12 +448,12 @@ do { \
_vec_len (my_tx_buffers) -= 1; \
} while (0)
-#define tcp_return_buffer(tm) \
-do { \
- u32 *my_tx_buffers; \
- u32 thread_index = vlib_get_thread_index(); \
- my_tx_buffers = tm->tx_buffers[thread_index]; \
- _vec_len (my_tx_buffers) +=1; \
+#define tcp_return_buffer(tm) \
+do { \
+ u32 *my_tx_buffers; \
+ u32 thread_index = vlib_get_thread_index(); \
+ my_tx_buffers = tm->tx_buffers[thread_index]; \
+ _vec_len (my_tx_buffers) +=1; \
} while (0)
always_inline void
@@ -757,23 +760,22 @@ void
tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b)
{
tcp_header_t *th = vlib_buffer_get_current (b);
-
+ vlib_main_t *vm = vlib_get_main ();
if (tc->c_is_ip4)
{
ip4_header_t *ih;
- ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4,
+ ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4,
&tc->c_rmt_ip4, IP_PROTOCOL_TCP);
- th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih);
}
else
{
ip6_header_t *ih;
int bogus = ~0;
- ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6,
+ ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6,
&tc->c_rmt_ip6, IP_PROTOCOL_TCP);
- th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih,
- &bogus);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus);
ASSERT (!bogus);
}
}
@@ -851,6 +853,13 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4)
/* Decide where to send the packet */
next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
+ /* Initialize the trajectory trace, if configured */
+ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+ {
+ b->pre_data[0] = 1;
+ b->pre_data[1] = next_index;
+ }
+
/* Enqueue the packet */
f = vlib_get_frame_to_node (vm, next_index);
to_next = vlib_frame_vector_args (f);
@@ -1144,6 +1153,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* Account for the SYN */
tc->snd_nxt += 1;
+ tc->rtt_ts = 0;
}
else
{
@@ -1232,7 +1242,7 @@ tcp_timer_persist_handler (u32 index)
/* Nothing to send */
if (n_bytes <= 0)
{
- clib_warning ("persist found nothing to send");
+ // clib_warning ("persist found nothing to send");
tcp_return_buffer (tm);
return;
}
@@ -1448,7 +1458,7 @@ tcp46_output_inline (vlib_main_t * vm,
tcp_connection_t *tc0;
tcp_tx_trace_t *t0;
tcp_header_t *th0 = 0;
- u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP;
+ u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP;
bi0 = from[0];
to_next[0] = bi0;
@@ -1527,6 +1537,7 @@ tcp46_output_inline (vlib_main_t * vm,
tc0->rto_boff = 0;
}
+#if 0
/* Make sure we haven't lost route to our peer */
if (PREDICT_FALSE (tc0->last_fib_check
< tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD))
@@ -1547,6 +1558,10 @@ tcp46_output_inline (vlib_main_t * vm,
/* Use pre-computed dpo to set next node */
next0 = tc0->c_rmt_dpo.dpoi_next_node;
vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index;
+#endif
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED;
done:
diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h
index a6f62ee16d5..9ccfe6553ff 100644
--- a/src/vnet/tcp/tcp_packet.h
+++ b/src/vnet/tcp/tcp_packet.h
@@ -168,6 +168,7 @@ typedef struct
#define TCP_OPTION_LEN_TIMESTAMP 10
#define TCP_OPTION_LEN_SACK_BLOCK 8
+#define TCP_HDR_LEN_MAX 60
#define TCP_WND_MAX 65535U
#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */
#define TCP_OPTS_ALIGN 4
diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c
index a461e3b8b7d..510deb4fec0 100644
--- a/src/vnet/tcp/tcp_test.c
+++ b/src/vnet/tcp/tcp_test.c
@@ -290,7 +290,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input)
{
tcp_connection_t _tc, *tc = &_tc;
sack_block_t *sacks;
- int i, verbose = 0;
+ int i, verbose = 0, expected;
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
{
@@ -326,8 +326,12 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input)
sacks = vec_dup (tc->snd_sacks);
tcp_update_sack_list (tc, 1100, 1200);
- TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d",
- vec_len (tc->snd_sacks), 5);
+ if (verbose)
+ vlib_cli_output (vm, "add new segment [1100, 1200]\n%U",
+ format_tcp_sacks, tc);
+ expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5;
+ TCP_TEST ((vec_len (tc->snd_sacks) == expected),
+ "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected);
TCP_TEST ((tc->snd_sacks[0].start == 1100),
"first sack block start %u expected %u", tc->snd_sacks[0].start,
1100);