aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2017-06-09 21:07:32 -0700
committerDamjan Marion <dmarion.lists@gmail.com>2017-06-19 14:06:34 +0000
commitf03a59ab008908f98fd7d1b187a8c0fb78b01add (patch)
tree8ce1ab091e288d5edbc5df712f668e2e4888c90e /src/vnet
parent328dbc886d7acd3491cff86a7a85176e511acf35 (diff)
Overall tcp performance improvements (VPP-846)
- limit minimum rto per connection - cleanup sack scoreboard - switched svm fifo out-of-order data handling from absolute offsets to relative offsets. - improve cwnd handling when using sacks - add cc event debug stats - improved uri tcp test client/server: bugfixes and added half-duplex mode - expanded builtin client/server - updated uri socket client/server code to work in half-duplex - ensure session node unsets fifo event for empty fifo - fix session detach Change-Id: Ia446972340e32a65e0694ee2844355167d0c170d Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet')
-rw-r--r--src/vnet/session/application.c2
-rw-r--r--src/vnet/session/node.c7
-rw-r--r--src/vnet/session/segment_manager.c5
-rw-r--r--src/vnet/session/session.c25
-rw-r--r--src/vnet/session/session.h4
-rwxr-xr-xsrc/vnet/session/session_api.c2
-rw-r--r--src/vnet/session/transport.h2
-rw-r--r--src/vnet/tcp/builtin_client.c325
-rw-r--r--src/vnet/tcp/builtin_client.h107
-rw-r--r--src/vnet/tcp/builtin_server.c114
-rw-r--r--src/vnet/tcp/tcp.c22
-rw-r--r--src/vnet/tcp/tcp.h12
-rwxr-xr-xsrc/vnet/tcp/tcp_debug.h186
-rw-r--r--src/vnet/tcp/tcp_input.c193
-rw-r--r--src/vnet/tcp/tcp_newreno.c4
-rw-r--r--src/vnet/tcp/tcp_output.c10
-rw-r--r--src/vnet/tcp/tcp_test.c70
17 files changed, 664 insertions, 426 deletions
diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c
index c679b1f53f8..4bdb10271ab 100644
--- a/src/vnet/session/application.c
+++ b/src/vnet/session/application.c
@@ -117,7 +117,7 @@ application_del (application_t * app)
/* Actual listener cleanup */
for (i = 0; i < vec_len (handles); i++)
{
- a->app_index = app->api_client_index;
+ a->app_index = app->index;
a->handle = handles[i];
/* seg manager is removed when unbind completes */
vnet_unbind (a);
diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index 07eeae82915..c0ab1bf096f 100644
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c
@@ -171,7 +171,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Nothing to read return */
if (max_dequeue0 == 0)
- return 0;
+ {
+ svm_fifo_unset_event (s0->server_tx_fifo);
+ return 0;
+ }
/* Ensure we're not writing more than transport window allows */
if (max_dequeue0 < snd_space0)
@@ -393,7 +396,7 @@ session_event_get_session (session_fifo_event_t * e0, u8 thread_index)
s0 = stream_session_get_if_valid (session_index0, thread_index);
- ASSERT (s0->thread_index == thread_index);
+ ASSERT (s0 == 0 || s0->thread_index == thread_index);
return s0;
}
diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c
index caf8eaa308a..bf571963b0e 100644
--- a/src/vnet/session/segment_manager.c
+++ b/src/vnet/session/segment_manager.c
@@ -306,11 +306,13 @@ again:
if (added_a_segment)
{
clib_warning ("added a segment, still cant allocate a fifo");
+ clib_spinlock_unlock (&sm->lockp);
return SESSION_ERROR_NEW_SEG_NO_SPACE;
}
if (session_manager_add_segment (sm))
{
+ clib_spinlock_unlock (&sm->lockp);
return VNET_API_ERROR_URI_FIFO_CREATE_FAILED;
}
@@ -320,6 +322,7 @@ again:
else
{
clib_warning ("No space to allocate fifos!");
+ clib_spinlock_unlock (&sm->lockp);
return SESSION_ERROR_NO_SPACE;
}
}
@@ -361,8 +364,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo,
if (sm->segment_indices[0] != svm_segment_index
&& !svm_fifo_segment_has_fifos (fifo_segment))
{
+ clib_spinlock_lock (&sm->lockp);
svm_fifo_segment_delete (fifo_segment);
vec_del1 (sm->segment_indices, svm_segment_index);
+ clib_spinlock_unlock (&sm->lockp);
}
}
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index 534598d637f..fe198044db9 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -700,7 +700,7 @@ stream_session_init_fifos_pointers (transport_connection_t * tc,
svm_fifo_init_pointers (s->server_tx_fifo, tx_pointer);
}
-void
+int
stream_session_connect_notify (transport_connection_t * tc, u8 sst,
u8 is_fail)
{
@@ -709,6 +709,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst,
stream_session_t *new_s = 0;
u64 handle;
u32 api_context = 0;
+ int error = 0;
handle = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip,
tc->lcl_port, tc->rmt_port,
@@ -716,7 +717,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst,
if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE)
{
clib_warning ("This can't be good!");
- return;
+ return -1;
}
/* Get the app's index from the handle we stored when opening connection */
@@ -730,9 +731,12 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst,
/* Create new session (svm segments are allocated if needed) */
if (stream_session_create_i (sm, tc, &new_s))
- return;
-
- new_s->app_index = app->index;
+ {
+ is_fail = 1;
+ error = -1;
+ }
+ else
+ new_s->app_index = app->index;
}
/* Notify client */
@@ -741,6 +745,8 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst,
/* Cleanup session lookup */
stream_session_half_open_table_del (smm, sst, tc);
+
+ return error;
}
void
@@ -981,8 +987,13 @@ session_send_session_evt_to_thread (u64 session_handle,
/* Based on request block (or not) for lack of space */
if (PREDICT_TRUE (q->cursize < q->maxsize))
- unix_shared_memory_queue_add (q, (u8 *) & evt,
- 0 /* do wait for mutex */ );
+ {
+ if (unix_shared_memory_queue_add (q, (u8 *) & evt,
+ 1 /* do wait for mutex */ ))
+ {
+ clib_warning ("failed to enqueue evt");
+ }
+ }
else
{
clib_warning ("queue full");
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index d9c38bd1ad8..5fa4225c4ed 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -368,8 +368,8 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer,
u32 offset, u32 max_bytes);
u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes);
-void stream_session_connect_notify (transport_connection_t * tc, u8 sst,
- u8 is_fail);
+int stream_session_connect_notify (transport_connection_t * tc, u8 sst,
+ u8 is_fail);
void stream_session_init_fifos_pointers (transport_connection_t * tc,
u32 rx_pointer, u32 tx_pointer);
diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c
index f772cb9fdb4..60f764af48a 100755
--- a/src/vnet/session/session_api.c
+++ b/src/vnet/session/session_api.c
@@ -419,7 +419,7 @@ done:
REPLY_MACRO (VL_API_UNBIND_URI_REPLY);
}
-void
+static void
vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp)
{
vl_api_connect_uri_reply_t *rmp;
diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h
index e5f788bee08..04bd5ca0f29 100644
--- a/src/vnet/session/transport.h
+++ b/src/vnet/session/transport.h
@@ -39,6 +39,7 @@ typedef struct _transport_connection
#if TRANSPORT_DEBUG
elog_track_t elog_track; /**< Event logging */
+ u32 cc_stat_tstamp; /**< CC stats timestamp */
#endif
/** Macros for 'derived classes' where base is named "connection" */
@@ -57,6 +58,7 @@ typedef struct _transport_connection
#define c_is_ip4 connection.is_ip4
#define c_thread_index connection.thread_index
#define c_elog_track connection.elog_track
+#define c_cc_stat_tstamp connection.cc_stat_tstamp
} transport_connection_t;
/*
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 7238cda3e60..6f8be082d95 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -43,7 +43,7 @@
#include <vpp/api/vpe_all_api_h.h>
#undef vl_printfun
-#define TCP_BUILTIN_CLIENT_DBG (1)
+#define TCP_BUILTIN_CLIENT_DBG (0)
static void
send_test_chunk (tclient_main_t * tm, session_t * s)
@@ -92,7 +92,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s)
ed->data[2] = s->bytes_to_send;
}
- /* Poke the TCP state machine */
+ /* Poke the session layer */
if (svm_fifo_set_event (s->server_tx_fifo))
{
/* Fabricate TX event, send to vpp */
@@ -100,8 +100,9 @@ send_test_chunk (tclient_main_t * tm, session_t * s)
evt.event_type = FIFO_EVENT_APP_TX;
evt.event_id = serial_number++;
- unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
- 0 /* do wait for mutex */ );
+ if (unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("could not enqueue event");
}
}
}
@@ -188,13 +189,13 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
sp = pool_elt_at_index (tm->sessions, connection_indices[i]);
- if (tx_quota < 60 && sp->bytes_to_send > 0)
+ if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0)
{
send_test_chunk (tm, sp);
delete_session = 0;
tx_quota++;
}
- if (sp->bytes_to_receive > 0)
+ if (!tm->no_return && sp->bytes_to_receive > 0)
{
prev_bytes_received_this_session = sp->bytes_received;
receive_test_chunk (tm, sp);
@@ -205,13 +206,14 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
}
if (PREDICT_FALSE (delete_session == 1))
{
+ __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send);
__sync_fetch_and_add (&tm->rx_total, sp->bytes_received);
+
dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp));
memset (dmp, 0, sizeof (*dmp));
dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION);
dmp->client_index = tm->my_client_index;
dmp->handle = sp->vpp_session_handle;
-// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp);
if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp,
1))
{
@@ -247,7 +249,6 @@ VLIB_REGISTER_NODE (builtin_client_node) =
};
/* *INDENT-ON* */
-
/* So we don't get "no handler for... " msgs */
static void
vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp)
@@ -255,76 +256,10 @@ vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp)
vlib_main_t *vm = vlib_get_main ();
tclient_main_t *tm = &tclient_main;
tm->my_client_index = mp->index;
- vlib_process_signal_event (vm, tm->node_index, 1 /* evt */ ,
+ vlib_process_signal_event (vm, tm->cli_node_index, 1 /* evt */ ,
0 /* data */ );
}
-static void
-vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp)
-{
- tclient_main_t *tm = &tclient_main;
- session_t *session;
- u32 session_index;
- i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ;
- int i;
-
- if (retval < 0)
- {
- clib_warning ("connection failed: retval %d", retval);
- return;
- }
-
- tm->our_event_queue =
- uword_to_pointer (mp->vpp_event_queue_address,
- unix_shared_memory_queue_t *);
- tm->vpp_event_queue =
- uword_to_pointer (mp->vpp_event_queue_address,
- unix_shared_memory_queue_t *);
-
- /*
- * Setup session
- */
- pool_get (tm->sessions, session);
- memset (session, 0, sizeof (*session));
- session_index = session - tm->sessions;
- session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send;
-
- session->server_rx_fifo =
- uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *);
- session->server_rx_fifo->client_session_index = session_index;
- session->server_tx_fifo =
- uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *);
- session->server_tx_fifo->client_session_index = session_index;
- session->vpp_session_handle = mp->handle;
-
- /* Add it to the session lookup table */
- hash_set (tm->session_index_by_vpp_handles, mp->handle, session_index);
-
- if (tm->ready_connections == tm->expected_connections - 1)
- {
- vlib_thread_main_t *thread_main = vlib_get_thread_main ();
- int thread_index;
-
- thread_index = 0;
- for (i = 0; i < pool_elts (tm->sessions); i++)
- {
- vec_add1 (tm->connection_index_by_thread[thread_index], i);
- thread_index++;
- if (thread_index == thread_main->n_vlib_mains)
- thread_index = 0;
- }
- }
- __sync_fetch_and_add (&tm->ready_connections, 1);
- if (tm->ready_connections == tm->expected_connections)
- {
- tm->run_test = 1;
- tm->test_start_time = vlib_time_now (tm->vlib_main);
- /* Signal the CLI process that the action is starting... */
- vlib_process_signal_event (tm->vlib_main, tm->cli_node_index,
- 1, 0 /* data */ );
- }
-}
-
static int
create_api_loopback (tclient_main_t * tm)
{
@@ -347,12 +282,11 @@ create_api_loopback (tclient_main_t * tm)
mp->_vl_msg_id = VL_API_MEMCLNT_CREATE;
mp->context = 0xFEEDFACE;
mp->input_queue = pointer_to_uword (tm->vl_input_queue);
- strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1);
+ strncpy ((char *) mp->name, "tcp_clients_tester", sizeof (mp->name) - 1);
vl_api_memclnt_create_t_handler (mp);
/* Wait for reply */
- tm->node_index = vlib_get_current_process (vm)->node_runtime.node_index;
vlib_process_wait_for_event_or_clock (vm, 1.0);
event_type = vlib_process_get_events (vm, &event_data);
switch (event_type)
@@ -373,7 +307,6 @@ create_api_loopback (tclient_main_t * tm)
#define foreach_tclient_static_api_msg \
_(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \
-_(CONNECT_URI_REPLY, connect_uri_reply)
static clib_error_t *
tclient_api_hookup (vlib_main_t * vm)
@@ -411,8 +344,8 @@ tcp_test_clients_init (vlib_main_t * vm)
if (create_api_loopback (tm))
return -1;
- /* Init test data */
- vec_validate (tm->connect_test_data, 64 * 1024 - 1);
+ /* Init test data. Big buffer */
+ vec_validate (tm->connect_test_data, 1024 * 1024 - 1);
for (i = 0; i < vec_len (tm->connect_test_data); i++)
tm->connect_test_data[i] = i & 0xff;
@@ -430,37 +363,66 @@ static int
builtin_session_connected_callback (u32 app_index, u32 api_context,
stream_session_t * s, u8 is_fail)
{
- vl_api_connect_uri_reply_t _m, *mp = &_m;
- unix_shared_memory_queue_t *q;
- application_t *app;
- unix_shared_memory_queue_t *vpp_queue;
+ tclient_main_t *tm = &tclient_main;
+ session_t *session;
+ u32 session_index;
+ int i;
- app = application_get (app_index);
- q = vl_api_client_index_to_input_queue (app->api_client_index);
+ if (is_fail)
+ {
+ clib_warning ("connection %d failed!", api_context);
+ vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, -1,
+ 0 /* data */ );
+ return -1;
+ }
- if (!q)
- return -1;
+ /* Mark vpp session as connected */
+ s->session_state = SESSION_STATE_READY;
- memset (mp, 0, sizeof (*mp));
- mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY);
- mp->context = api_context;
- if (!is_fail)
+ tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index);
+ tm->vpp_event_queue = session_manager_get_vpp_event_queue (s->thread_index);
+
+ /*
+ * Setup session
+ */
+ pool_get (tm->sessions, session);
+ memset (session, 0, sizeof (*session));
+ session_index = session - tm->sessions;
+ session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send;
+ session->server_rx_fifo = s->server_rx_fifo;
+ session->server_rx_fifo->client_session_index = session_index;
+ session->server_tx_fifo = s->server_tx_fifo;
+ session->server_tx_fifo->client_session_index = session_index;
+ session->vpp_session_handle = stream_session_handle (s);
+
+ /* Add it to the session lookup table */
+ hash_set (tm->session_index_by_vpp_handles, session->vpp_session_handle,
+ session_index);
+
+ if (tm->ready_connections == tm->expected_connections - 1)
{
- vpp_queue = session_manager_get_vpp_event_queue (s->thread_index);
- mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo);
- mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo);
- mp->handle = stream_session_handle (s);
- mp->vpp_event_queue_address = pointer_to_uword (vpp_queue);
- mp->retval = 0;
- s->session_state = SESSION_STATE_READY;
+ vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+ int thread_index;
+
+ thread_index = 0;
+ for (i = 0; i < pool_elts (tm->sessions); i++)
+ {
+ vec_add1 (tm->connection_index_by_thread[thread_index], i);
+ thread_index++;
+ if (thread_index == thread_main->n_vlib_mains)
+ thread_index = 0;
+ }
}
- else
+ __sync_fetch_and_add (&tm->ready_connections, 1);
+ if (tm->ready_connections == tm->expected_connections)
{
- mp->retval = clib_host_to_net_u32 (VNET_API_ERROR_SESSION_CONNECT_FAIL);
+ tm->run_test = 1;
+ tm->test_start_time = vlib_time_now (tm->vlib_main);
+ /* Signal the CLI process that the action is starting... */
+ vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, 1,
+ 0 /* data */ );
}
- vl_api_connect_uri_reply_t_handler (mp);
-
return 0;
}
@@ -489,23 +451,22 @@ builtin_server_rx_callback (stream_session_t * s)
}
/* *INDENT-OFF* */
-static session_cb_vft_t builtin_clients =
- {
- .session_reset_callback = builtin_session_reset_callback,
- .session_connected_callback = builtin_session_connected_callback,
- .session_accept_callback = builtin_session_create_callback,
- .session_disconnect_callback = builtin_session_disconnect_callback,
- .builtin_server_rx_callback = builtin_server_rx_callback
- };
+static session_cb_vft_t builtin_clients = {
+ .session_reset_callback = builtin_session_reset_callback,
+ .session_connected_callback = builtin_session_connected_callback,
+ .session_accept_callback = builtin_session_create_callback,
+ .session_disconnect_callback = builtin_session_disconnect_callback,
+ .builtin_server_rx_callback = builtin_server_rx_callback
+};
/* *INDENT-ON* */
static int
-attach_builtin_test_clients ()
+attach_builtin_test_clients_app (void)
{
tclient_main_t *tm = &tclient_main;
vnet_app_attach_args_t _a, *a = &_a;
u8 segment_name[128];
- u32 segment_name_length;
+ u32 segment_name_length, prealloc_fifos;
u64 options[16];
segment_name_length = ARRAY_LEN (segment_name);
@@ -518,13 +479,68 @@ attach_builtin_test_clients ()
a->segment_name_length = segment_name_length;
a->session_cb_vft = &builtin_clients;
+ prealloc_fifos = tm->prealloc_fifos ? tm->expected_connections : 1;
+
options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
- options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */
+ options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32);
+ options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size;
+ options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2;
+ options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos;
+
options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
a->options = options;
- return vnet_application_attach (a);
+ if (vnet_application_attach (a))
+ return -1;
+
+ tm->app_index = a->app_index;
+ return 0;
+}
+
+static void *
+tclient_thread_fn (void *arg)
+{
+ return 0;
+}
+
+/** Start a transmit thread */
+int
+start_tx_pthread (tclient_main_t * tm)
+{
+ if (tm->client_thread_handle == 0)
+ {
+ int rv = pthread_create (&tm->client_thread_handle,
+ NULL /*attr */ ,
+ tclient_thread_fn, 0);
+ if (rv)
+ {
+ tm->client_thread_handle = 0;
+ return -1;
+ }
+ }
+ return 0;
+}
+
+void
+clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients)
+{
+ tclient_main_t *tm = &tclient_main;
+ vnet_connect_args_t _a, *a = &_a;
+ int i;
+ for (i = 0; i < n_clients; i++)
+ {
+ memset (a, 0, sizeof (*a));
+
+ a->uri = (char *) uri;
+ a->api_context = i;
+ a->app_index = tm->app_index;
+ a->mp = 0;
+ vnet_connect_uri (a);
+
+ /* Crude pacing for call setups, 100k/sec */
+ vlib_process_suspend (vm, 10e-6);
+ }
}
static clib_error_t *
@@ -534,17 +550,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
{
tclient_main_t *tm = &tclient_main;
vlib_thread_main_t *thread_main = vlib_get_thread_main ();
- uword *event_data = 0;
- uword event_type;
- u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234";
- u8 *uri;
+ uword *event_data = 0, event_type;
+ u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri;
+ u64 tmp, total_bytes;
+ f64 cli_timeout = 20.0, delta;
u32 n_clients = 1;
+ char *transfer_type;
int i;
- u64 tmp;
- f64 cli_timeout = 20.0;
- f64 delta;
tm->bytes_to_send = 8192;
+ tm->no_return = 0;
+ tm->fifo_size = 64 << 10;
+
vec_free (tm->connect_uri);
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
@@ -561,11 +578,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
;
else if (unformat (input, "cli-timeout %f", &cli_timeout))
;
+ else if (unformat (input, "no-return"))
+ tm->no_return = 1;
+ else if (unformat (input, "fifo-size %d", &tm->fifo_size))
+ tm->fifo_size <<= 10;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
}
+ /* Store cli process node index for signalling */
+ tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index;
+
if (tm->is_init == 0)
{
if (tcp_test_clients_init (vm))
@@ -575,28 +599,25 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
tm->ready_connections = 0;
tm->expected_connections = n_clients;
tm->rx_total = 0;
+ tm->tx_total = 0;
- uri = connect_uri;
+ uri = default_connect_uri;
if (tm->connect_uri)
uri = tm->connect_uri;
#if TCP_BUILTIN_CLIENT_PTHREAD
- /* Start a transmit thread */
- if (tm->client_thread_handle == 0)
+ start_tx_pthread ();
+#endif
+
+ vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+
+ if (tm->test_client_attached == 0)
{
- int rv = pthread_create (&tm->client_thread_handle,
- NULL /*attr */ ,
- tclient_thread_fn, 0);
- if (rv)
+ if (attach_builtin_test_clients_app ())
{
- tm->client_thread_handle = 0;
- return clib_error_return (0, "pthread_create returned %d", rv);
+ return clib_error_return (0, "app attach failed");
}
}
-#endif
- vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
- if (tm->test_client_attached == 0)
- attach_builtin_test_clients ();
tm->test_client_attached = 1;
/* Turn on the builtin client input nodes */
@@ -604,25 +625,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
vlib_node_set_state (vlib_mains[i], builtin_client_node.index,
VLIB_NODE_STATE_POLLING);
- tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index;
-
/* Fire off connect requests */
- for (i = 0; i < n_clients; i++)
- {
- vl_api_connect_uri_t _cmp, *cmp = &_cmp;
- void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * cmp);
-
- memset (cmp, 0, sizeof (*cmp));
-
- cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI);
- cmp->client_index = tm->my_client_index;
- cmp->context = ntohl (0xfeedface);
- memcpy (cmp->uri, uri, strlen ((char *) uri) + 1);
-
- vl_api_connect_uri_t_handler (cmp);
- /* Crude pacing for call setups, 100k/sec */
- vlib_process_suspend (vm, 10e-6);
- }
+ clients_connect (vm, uri, n_clients);
/* Park until the sessions come up, or ten seconds elapse... */
vlib_process_wait_for_event_or_clock (vm, 10.0 /* timeout, seconds */ );
@@ -668,14 +672,17 @@ test_tcp_clients_command_fn (vlib_main_t * vm,
if (delta != 0.0)
{
+ total_bytes = (tm->no_return ? tm->tx_total : tm->rx_total);
+ transfer_type = tm->no_return ? "half-duplex" : "full-duplex";
vlib_cli_output (vm,
"%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds",
- tm->rx_total, tm->rx_total / (1ULL << 20),
- tm->rx_total / (1ULL << 30), delta);
- vlib_cli_output (vm, "%.2f bytes/second full-duplex",
- ((f64) tm->rx_total) / (delta));
- vlib_cli_output (vm, "%.4f gbit/second full-duplex",
- (((f64) tm->rx_total * 8.0) / delta / 1e9));
+ total_bytes, total_bytes / (1ULL << 20),
+ total_bytes / (1ULL << 30), delta);
+ vlib_cli_output (vm, "%.2f bytes/second %s",
+ ((f64) total_bytes) / (delta), transfer_type);
+ vlib_cli_output (vm, "%.4f gbit/second %s",
+ (((f64) total_bytes * 8.0) / delta / 1e9),
+ transfer_type);
}
else
vlib_cli_output (vm, "zero delta-t?");
diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h
index d5d79e53a22..3462e0eeee8 100644
--- a/src/vnet/tcp/builtin_client.h
+++ b/src/vnet/tcp/builtin_client.h
@@ -44,78 +44,59 @@ typedef struct
typedef struct
{
- /* API message ID base */
- u16 msg_id_base;
-
- /* vpe input queue */
- unix_shared_memory_queue_t *vl_input_queue;
-
- /* API client handle */
- u32 my_client_index;
-
- /* The URI we're playing with */
- u8 *uri;
-
- /* Session pool */
- session_t *sessions;
-
- /* Hash table for disconnect processing */
- uword *session_index_by_vpp_handles;
-
- /* intermediate rx buffer */
- u8 *rx_buf;
-
- /* URI for slave's connect */
- u8 *connect_uri;
-
- u32 connected_session_index;
-
- int i_am_master;
-
- /* drop all packets */
- int drop_packets;
-
- /* Our event queue */
- unix_shared_memory_queue_t *our_event_queue;
-
- /* $$$ single thread only for the moment */
- unix_shared_memory_queue_t *vpp_event_queue;
-
- pid_t my_pid;
-
- f64 test_start_time;
- f64 test_end_time;
-
- u32 expected_connections;
+ /*
+ * Application setup parameters
+ */
+ unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */
+ unix_shared_memory_queue_t *our_event_queue; /**< Our event queue */
+ unix_shared_memory_queue_t *vpp_event_queue; /**< $$$ single thread */
+
+ u32 cli_node_index; /**< cli process node index */
+ u32 my_client_index; /**< loopback API client handle */
+ u32 app_index; /**< app index after attach */
+
+ /*
+ * Configuration params
+ */
+ u8 *connect_uri; /**< URI for slave's connect */
+ u64 bytes_to_send; /**< Bytes to send */
+ u32 configured_segment_size;
+ u32 fifo_size;
+ u32 expected_connections; /**< Number of clients/connections */
+
+ /*
+ * Test state variables
+ */
+ session_t *sessions; /**< Sessions pool */
+ u8 *rx_buf; /**< intermediate rx buffer */
+ uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */
+ u8 *connect_test_data; /**< Pre-computed test data */
u32 **connection_index_by_thread;
+ pthread_t client_thread_handle;
+
volatile u32 ready_connections;
volatile u32 finished_connections;
-
volatile u64 rx_total;
- u32 cli_node_index;
-
- /* Signal variable */
- volatile int run_test;
-
- /* Bytes to send */
- u64 bytes_to_send;
-
- u32 configured_segment_size;
+ volatile u64 tx_total;
+ volatile int run_test; /**< Signal start of test */
- /* VNET_API_ERROR_FOO -> "Foo" hash table */
- uword *error_string_by_error_number;
-
- u8 *connect_test_data;
- pthread_t client_thread_handle;
- u64 client_bytes_received;
- u8 test_return_packets;
+ f64 test_start_time;
+ f64 test_end_time;
+ /*
+ * Flags
+ */
u8 is_init;
u8 test_client_attached;
+ u8 no_return;
+ u8 test_return_packets;
+ int i_am_master;
+ int drop_packets; /**< drop all packets */
+ u8 prealloc_fifos; /**< Request fifo preallocation */
- u32 node_index;
-
- /* convenience */
+ /*
+ * Convenience
+ */
vlib_main_t *vlib_main;
vnet_main_t *vnet_main;
ethernet_main_t *ethernet_main;
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 8bd2f3605e6..775bfc26e47 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -39,21 +39,30 @@
typedef struct
{
- /* Per-thread RX buffer */
- u8 **rx_buf;
+ /*
+ * Server app parameters
+ */
unix_shared_memory_queue_t **vpp_queue;
- u64 byte_index;
+ unix_shared_memory_queue_t *vl_input_queue; /**< Sever's event queue */
- /* Sever's event queue */
- unix_shared_memory_queue_t *vl_input_queue;
+ u32 app_index; /**< Server app index */
+ u32 my_client_index; /**< API client handle */
+ u32 node_index; /**< process node index for evnt scheduling */
- /* API client handle */
- u32 my_client_index;
+ /*
+ * Config params
+ */
+ u8 no_echo; /**< Don't echo traffic */
+ u32 fifo_size; /**< Fifo size */
+ u32 rcv_buffer_size; /**< Rcv buffer size */
+ u32 prealloc_fifos; /**< Preallocate fifos */
- u32 app_index;
+ /*
+ * Test state
+ */
+ u8 **rx_buf; /**< Per-thread RX buffer */
+ u64 byte_index;
- /* process node index for evnt scheduling */
- u32 node_index;
vlib_main_t *vlib_main;
} builtin_server_main_t;
@@ -132,6 +141,29 @@ test_bytes (builtin_server_main_t * bsm, int actual_transfer)
bsm->byte_index += actual_transfer;
}
+/*
+ * If no-echo, just read the data and be done with it
+ */
+int
+builtin_server_rx_callback_no_echo (stream_session_t * s)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ u32 my_thread_id = vlib_get_thread_index ();
+ int actual_transfer;
+ svm_fifo_t *rx_fifo;
+
+ rx_fifo = s->server_rx_fifo;
+
+ do
+ {
+ actual_transfer =
+ svm_fifo_dequeue_nowait (rx_fifo, bsm->rcv_buffer_size,
+ bsm->rx_buf[my_thread_id]);
+ }
+ while (actual_transfer > 0);
+ return 0;
+}
+
int
builtin_server_rx_callback (stream_session_t * s)
{
@@ -143,8 +175,8 @@ builtin_server_rx_callback (stream_session_t * s)
static int serial_number = 0;
u32 my_thread_id = vlib_get_thread_index ();
- tx_fifo = s->server_tx_fifo;
rx_fifo = s->server_rx_fifo;
+ tx_fifo = s->server_tx_fifo;
max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo);
@@ -164,19 +196,22 @@ builtin_server_rx_callback (stream_session_t * s)
/* Program self-tap to retry */
if (svm_fifo_set_event (rx_fifo))
{
+ unix_shared_memory_queue_t *q;
evt.fifo = rx_fifo;
evt.event_type = FIFO_EVENT_BUILTIN_RX;
evt.event_id = 0;
- unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index],
- (u8 *) & evt,
- 0 /* do wait for mutex */ );
+
+ q = bsm->vpp_queue[s->thread_index];
+ if (PREDICT_FALSE (q->cursize == q->maxsize))
+ clib_warning ("out of event queue space");
+ else
+ unix_shared_memory_queue_add (q, (u8 *) & evt,
+ 0 /* don't wait for mutex */ );
}
return 0;
}
- vec_validate (bsm->rx_buf, my_thread_id);
- vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1);
_vec_len (bsm->rx_buf[my_thread_id]) = max_transfer;
actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer,
@@ -281,14 +316,21 @@ server_attach ()
memset (a, 0, sizeof (*a));
memset (options, 0, sizeof (options));
+ if (bsm->no_echo)
+ builtin_session_cb_vft.builtin_server_rx_callback =
+ builtin_server_rx_callback_no_echo;
+ else
+ builtin_session_cb_vft.builtin_server_rx_callback =
+ builtin_server_rx_callback;
a->api_client_index = bsm->my_client_index;
a->session_cb_vft = &builtin_session_cb_vft;
a->options = options;
a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
- a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
- a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
+ a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size;
+ a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size;
a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
- a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 8192;
+ a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+ bsm->prealloc_fifos ? bsm->prealloc_fifos : 1;
a->segment_name = segment_name;
a->segment_name_length = ARRAY_LEN (segment_name);
@@ -316,17 +358,24 @@ static int
server_create (vlib_main_t * vm)
{
builtin_server_main_t *bsm = &builtin_server_main;
- u32 num_threads;
vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ u32 num_threads;
+ int i;
if (bsm->my_client_index == (u32) ~ 0)
{
if (create_api_loopback (vm))
- return -1;
+ {
+ clib_warning ("failed to create api loopback");
+ return -1;
+ }
}
num_threads = 1 /* main thread */ + vtm->n_threads;
vec_validate (builtin_server_main.vpp_queue, num_threads - 1);
+ vec_validate (bsm->rx_buf, num_threads - 1);
+ for (i = 0; i < num_threads; i++)
+ vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size);
if (server_attach ())
{
@@ -381,23 +430,35 @@ tcp_builtin_server_api_hookup (vlib_main_t * vm)
}
static clib_error_t *
-server_create_command_fn (vlib_main_t * vm,
- unformat_input_t * input, vlib_cli_command_t * cmd)
+server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
{
+ builtin_server_main_t *bsm = &builtin_server_main;
int rv;
-#if 0
+
+ bsm->no_echo = 0;
+ bsm->fifo_size = 64 << 10;
+ bsm->rcv_buffer_size = 128 << 10;
+ bsm->prealloc_fifos = 0;
+
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
{
- if (unformat (input, "whatever %d", &whatever))
+ if (unformat (input, "no-echo"))
+ bsm->no_echo = 1;
+ else if (unformat (input, "fifo-size %d", &bsm->fifo_size))
+ bsm->fifo_size <<= 10;
+ else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size))
+ ;
+ else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos))
;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
}
-#endif
tcp_builtin_server_api_hookup (vm);
vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+
rv = server_create (vm);
switch (rv)
{
@@ -406,6 +467,7 @@ server_create_command_fn (vlib_main_t * vm,
default:
return clib_error_return (0, "server_create returned %d", rv);
}
+
return 0;
}
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index e0b67a8e5e5..5c554bac5a9 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -726,15 +726,25 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
u32
tcp_snd_space (tcp_connection_t * tc)
{
- int snd_space;
+ int snd_space, snt_limited;
- /* If we haven't gotten dupacks or if we did and have gotten sacked bytes
- * then we can still send */
- if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0
- && (tc->rcv_dupacks == 0
- || tc->sack_sb.last_sacked_bytes)))
+ if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
{
snd_space = tcp_available_snd_space (tc);
+
+ /* If we haven't gotten dupacks or if we did and have gotten sacked
+ * bytes then we can still send as per Limited Transmit (RFC3042) */
+ if (PREDICT_FALSE (tc->rcv_dupacks != 0
+ && (tcp_opts_sack_permitted (tc)
+ && tc->sack_sb.last_sacked_bytes == 0)))
+ {
+ if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt)
+ tc->limited_transmit = tc->snd_nxt;
+ ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
+
+ snt_limited = tc->snd_nxt - tc->limited_transmit;
+ snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0);
+ }
return tcp_round_snd_space (tc, snd_space);
}
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 071f1ab185f..e83987182fa 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -31,9 +31,9 @@
#define TCP_MAX_OPTION_SPACE 40
#define TCP_DUPACK_THRESHOLD 3
-#define TCP_MAX_RX_FIFO_SIZE 2 << 20
+#define TCP_MAX_RX_FIFO_SIZE 4 << 20
#define TCP_IW_N_SEGMENTS 10
-#define TCP_ALWAYS_ACK 0 /**< If on, we always ack */
+#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */
#define TCP_USE_SACKS 1 /**< Disable only for testing */
/** TCP FSM state definitions as per RFC793. */
@@ -100,6 +100,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
#define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */
#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */
+#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */
#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */
#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */
#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */
@@ -149,7 +150,7 @@ enum
#undef _
};
-#define TCP_MAX_SACK_BLOCKS 5 /**< Max number of SACK blocks stored */
+#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */
#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
typedef struct _sack_scoreboard_hole
@@ -208,6 +209,7 @@ typedef struct _tcp_connection
u32 snd_wl1; /**< seq number used for last snd.wnd update */
u32 snd_wl2; /**< ack number used for last snd.wnd update */
u32 snd_nxt; /**< next seq number to be sent */
+ u16 snd_mss; /**< Effective send max seg (data) size */
/** Receive sequence variables RFC793 */
u32 rcv_nxt; /**< next sequence number expected */
@@ -252,8 +254,8 @@ typedef struct _tcp_connection
u32 rtt_ts; /**< Timestamp for tracked ACK */
u32 rtt_seq; /**< Sequence number for tracked ACK */
- u16 snd_mss; /**< Effective send max seg (data) size */
u16 mss; /**< Our max seg size that includes options */
+ u32 limited_transmit; /**< snd_nxt when limited transmit starts */
} tcp_connection_t;
struct _tcp_cc_algorithm
@@ -433,6 +435,7 @@ tcp_end_seq (tcp_header_t * th, u32 len)
#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
+#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))
/* Modulo arithmetic for timestamps */
#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
@@ -719,6 +722,7 @@ scoreboard_clear (sack_scoreboard_t * sb)
{
scoreboard_remove_hole (sb, hole);
}
+ ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
sb->sacked_bytes = 0;
sb->last_sacked_bytes = 0;
sb->last_bytes_delivered = 0;
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 3a16cf63194..ae68ad1b264 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -19,8 +19,10 @@
#include <vlib/vlib.h>
#define TCP_DEBUG (1)
+#define TCP_DEBUG_SM (0)
#define TCP_DEBUG_CC (1)
-#define TCP_DEBUG_VERBOSE (0)
+#define TCP_DEBUG_CC_STAT (1)
+#define TCP_DEBUG_SM_VERBOSE (0)
#define foreach_tcp_dbg_evt \
_(INIT, "") \
@@ -49,6 +51,8 @@
_(CC_RTX, "retransmit") \
_(CC_EVT, "cc event") \
_(CC_PACK, "cc partial ack") \
+ _(CC_STAT, "cc stats") \
+ _(CC_RTO_STAT, "cc rto stats") \
_(SEG_INVALID, "invalid segment") \
_(PAWS_FAIL, "failed paws check") \
_(ACK_RCV_ERR, "invalid ack") \
@@ -72,6 +76,10 @@ typedef enum _tcp_dbg_evt
#define TRANSPORT_DEBUG (1)
+/*
+ * Infra and evt track setup
+ */
+
#define TCP_DBG(_tc, _evt, _args...) \
{ \
u8 *_tmp = 0; \
@@ -158,6 +166,30 @@ typedef enum _tcp_dbg_evt
TCP_EVT_DEALLOC_HANDLER(_tc); \
}
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \
+{ \
+ TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "SYNrx: irs %u", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->irs; \
+}
+
+#define CONCAT_HELPER(_a, _b) _a##_b
+#define CC(_a, _b) CONCAT_HELPER(_a, _b)
+#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
+#else
+#define TCP_EVT_DBG(_evt, _args...)
+#endif
+
+/*
+ * State machine
+ */
+#if TCP_DEBUG_SM
+
#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
@@ -234,18 +266,6 @@ typedef enum _tcp_dbg_evt
ed->data[1] = _tc->rcv_nxt - _tc->irs; \
}
-#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \
-{ \
- TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "SYNrx: irs %u", \
- .format_args = "i4", \
- }; \
- DECLARE_ETD(_tc, _e, 1); \
- ed->data[0] = _tc->irs; \
-}
-
#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
@@ -418,6 +438,74 @@ typedef enum _tcp_dbg_evt
ed->data[4] = _tc->snd_una_max - _tc->iss; \
}
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
+{ \
+if (_av > 0) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_wnd; \
+ ed->data[1] = _obs; \
+ ed->data[2] = _av; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_las - _tc->irs; \
+} \
+}
+#else
+#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_RST_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_PKTIZE_HANDLER(_tc, ...)
+#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...)
+#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...)
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
+#endif
+
+/*
+ * State machine verbose
+ */
+#if TCP_DBG_SM_VERBOSE
+#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "snd_wnd update: %u ", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->snd_wnd; \
+}
+
+#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "out: flags %x, bytes %u", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = flags; \
+ ed->data[1] = n_bytes; \
+}
+#else
+#define TCP_EVT_SND_WND_HANDLER(_tc, ...)
+#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...)
+#endif
+
/*
* Congestion Control
*/
@@ -471,67 +559,59 @@ typedef enum _tcp_dbg_evt
ed->data[1] = _tc->snd_una_max - _tc->iss; \
}
-#else
-#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
-#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, _snd_space, ...)
-#define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
-#endif
+/*
+ * Congestion control stats
+ */
+#if TCP_DEBUG_CC_STAT
-#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
+#define STATS_INTERVAL 1
+
+#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \
{ \
-if (_av > 0) \
+if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
- .format_args = "i4i4i4i4i4", \
+ .format = "rto_stat: rto %u srtt %u rttvar %u ", \
+ .format_args = "i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 5); \
- ed->data[0] = _tc->rcv_wnd; \
- ed->data[1] = _obs; \
- ed->data[2] = _av; \
- ed->data[3] = _tc->rcv_nxt - _tc->irs; \
- ed->data[4] = _tc->rcv_las - _tc->irs; \
+ DECLARE_ETD(_tc, _e, 3); \
+ ed->data[0] = _tc->rto; \
+ ed->data[1] = _tc->srtt; \
+ ed->data[2] = _tc->rttvar; \
} \
}
-#if TCP_DBG_VERBOSE
-#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \
{ \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "snd_wnd update: %u ", \
- .format_args = "i4", \
- }; \
- DECLARE_ETD(_tc, _e, 1); \
- ed->data[0] = _tc->snd_wnd; \
-}
-
-#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \
+if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "out: flags %x, bytes %u", \
- .format_args = "i4i4", \
+ .format = "cc_stat: cwnd %u flight %u space %u ssthresh %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 2); \
- ed->data[0] = flags; \
- ed->data[1] = n_bytes; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->cwnd; \
+ ed->data[1] = tcp_flight_size (_tc); \
+ ed->data[2] = tcp_snd_space (_tc); \
+ ed->data[3] = _tc->ssthresh; \
+ ed->data[4] = _tc->snd_wnd; \
+ TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \
+ _tc->c_cc_stat_tstamp = tcp_time_now(); \
+} \
}
+
#else
-#define TCP_EVT_SND_WND_HANDLER(_tc, ...)
-#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...)
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)
#endif
-#define CONCAT_HELPER(_a, _b) _a##_b
-#define CC(_a, _b) CONCAT_HELPER(_a, _b)
-#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
-
#else
-#define TCP_EVT_DBG(_evt, _args...)
+#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
#endif
-
#endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */
/*
* fd.io coding-style-patch-verification: ON
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index ff2229b3792..a2e6dad1298 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -378,16 +378,20 @@ tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
static void
tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
{
- int err;
+ int err, diff;
if (tc->srtt != 0)
{
err = mrtt - tc->srtt;
- tc->srtt += err >> 3;
+// tc->srtt += err >> 3;
/* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
* The increase should be bound */
- tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2;
+// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2;
+
+ tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
+ diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
+ tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
}
else
{
@@ -401,6 +405,7 @@ void
tcp_update_rto (tcp_connection_t * tc)
{
tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
}
/** Update RTT estimate and RTO timer
@@ -417,8 +422,8 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
u32 mrtt = 0;
u8 rtx_acked;
- /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */
- rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss);
+ /* Determine if only rtx bytes are acked. */
+ rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked;
/* Karn's rule, part 1. Don't use retransmitted segments to estimate
* RTT because they're ambiguous. */
@@ -428,8 +433,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
- * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't
- * try to update rtt for dupacks */
+ * seq_lt (tc->snd_una, ack). */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr
&& tc->bytes_acked)
{
@@ -550,11 +554,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
prev = scoreboard_get_hole (sb, prev_index);
if (prev)
{
- hole->prev = prev - sb->holes;
+ hole->prev = prev_index;
hole->next = prev->next;
if ((next = scoreboard_next_hole (sb, hole)))
next->prev = hole_index;
+ else
+ sb->tail = hole_index;
prev->next = hole_index;
}
@@ -569,12 +575,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
}
void
-scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb)
+scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb)
{
sack_scoreboard_hole_t *hole, *prev;
u32 bytes = 0, blks = 0;
sb->lost_bytes = 0;
+ sb->sacked_bytes = 0;
hole = scoreboard_last_hole (sb);
if (!hole)
return;
@@ -594,13 +601,16 @@ scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb)
hole = prev;
}
- hole = prev;
while (hole)
{
sb->lost_bytes += scoreboard_hole_bytes (hole);
hole->is_lost = 1;
+ prev = hole;
hole = scoreboard_prev_hole (sb, hole);
+ if (hole)
+ bytes += prev->start - hole->end;
}
+ sb->sacked_bytes = bytes;
}
/**
@@ -677,7 +687,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
sack_scoreboard_t *sb = &tc->sack_sb;
sack_block_t *blk, tmp;
- sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole;
+ sack_scoreboard_hole_t *hole, *next_hole, *last_hole;
u32 blk_index = 0, old_sacked_bytes, hole_index;
int i, j;
@@ -743,6 +753,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
if (seq_gt (tc->snd_una_max, sb->high_sacked)
&& seq_gt (tc->snd_una_max, last_hole->end))
last_hole->end = tc->snd_una_max;
+ /* keep track of max byte sacked for when the last hole
+ * is acked */
+ if (seq_gt (tmp.end, sb->high_sacked))
+ sb->high_sacked = tmp.end;
}
/* Walk the holes with the SACK blocks */
@@ -758,45 +772,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
next_hole = scoreboard_next_hole (sb, hole);
- /* Byte accounting */
- if (seq_leq (hole->end, ack))
- {
- /* Bytes lost because snd_wnd left edge advances */
- if (next_hole && seq_leq (next_hole->start, ack))
- sb->last_bytes_delivered += next_hole->start - hole->end;
- else
- sb->last_bytes_delivered += ack - hole->end;
- }
- else
- {
- sb->sacked_bytes += scoreboard_hole_bytes (hole);
- }
-
- /* About to remove last hole */
- if (hole == last_hole)
- {
- sb->tail = hole->prev;
- last_hole = scoreboard_last_hole (sb);
- /* keep track of max byte sacked for when the last hole
- * is acked */
- if (seq_gt (hole->end, sb->high_sacked))
- sb->high_sacked = hole->end;
- }
-
- /* snd_una needs to be advanced */
- if (blk->end == ack && seq_geq (ack, hole->end))
+ /* Byte accounting: snd_una needs to be advanced */
+ if (blk->end == ack)
{
- if (next_hole && seq_lt (ack, next_hole->start))
+ if (next_hole)
{
- sb->snd_una_adv = next_hole->start - ack;
-
- /* all these can be delivered */
- sb->last_bytes_delivered += sb->snd_una_adv;
+ if (seq_lt (ack, next_hole->start))
+ sb->snd_una_adv = next_hole->start - ack;
+ sb->last_bytes_delivered +=
+ next_hole->start - hole->end;
}
else if (!next_hole)
{
sb->snd_una_adv = sb->high_sacked - ack;
- sb->last_bytes_delivered += sb->snd_una_adv;
+ sb->last_bytes_delivered += sb->high_sacked - hole->end;
}
}
@@ -808,7 +797,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
{
if (seq_gt (blk->end, hole->start))
{
- sb->sacked_bytes += blk->end - hole->start;
hole->start = blk->end;
}
blk_index++;
@@ -819,28 +807,16 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
/* Hole must be split */
if (seq_lt (blk->end, hole->end))
{
- sb->sacked_bytes += blk->end - blk->start;
hole_index = scoreboard_hole_index (sb, hole);
- new_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
- hole->end);
+ scoreboard_insert_hole (sb, hole_index, blk->end, hole->end);
/* Pool might've moved */
hole = scoreboard_get_hole (sb, hole_index);
hole->end = blk->start;
-
- /* New or split of tail */
- if ((last_hole->end == new_hole->end)
- || seq_lt (last_hole->end, new_hole->start))
- {
- last_hole = new_hole;
- sb->tail = scoreboard_hole_index (sb, new_hole);
- }
-
blk_index++;
}
- else if (seq_leq (blk->start, hole->end))
+ else if (seq_lt (blk->start, hole->end))
{
- sb->sacked_bytes += hole->end - blk->start;
hole->end = blk->start;
}
@@ -848,9 +824,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
}
}
- sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes;
- sb->sacked_bytes -= sb->last_bytes_delivered;
- scoreboard_update_lost (tc, sb);
+ scoreboard_update_bytes (tc, sb);
+ sb->last_sacked_bytes = sb->sacked_bytes
+ - (old_sacked_bytes - sb->last_bytes_delivered);
+ ASSERT (sb->sacked_bytes == 0
+ || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
+ ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
+ - seq_max (tc->snd_una, ack));
}
/**
@@ -998,9 +978,14 @@ tcp_should_fastrecover (tcp_connection_t * tc)
|| tcp_should_fastrecover_sack (tc));
}
+/**
+ * One function to rule them all ... and in the darkness bind them
+ */
static void
tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
+ u32 rxt_delivered;
+
/*
* Duplicate ACK. Check if we should enter fast recovery, or if already in
* it account for the bytes that left the network.
@@ -1028,10 +1013,15 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
}
/* If of of the two conditions lower hold, reset dupacks
- * 1) Cumulative ack does not cover more than congestion threshold
+ * 1) Cumulative ack does not cover more than congestion threshold,
+ * and the following doesn't hold: the congestion window is
+ * greater than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes (XXX)
* 2) RFC6582 heuristic to avoid multiple fast retransmits
*/
- if (seq_leq (tc->snd_una, tc->snd_congestion)
+ if ((seq_gt (tc->snd_una, tc->snd_congestion)
+ || !(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
|| tc->rcv_opts.tsecr != tc->tsecr_last_ack)
{
tc->rcv_dupacks = 0;
@@ -1089,7 +1079,10 @@ partial_ack:
{
/* If spurious return, we've already updated everything */
if (tcp_cc_recover (tc))
- return;
+ {
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ return;
+ }
tc->snd_nxt = tc->snd_una_max;
@@ -1115,12 +1108,16 @@ partial_ack:
return;
/* Remove retransmitted bytes that have been delivered */
- if (tc->sack_sb.last_bytes_delivered
- && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
+ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+ >= tc->sack_sb.last_bytes_delivered);
+ rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+ - tc->sack_sb.last_bytes_delivered;
+ if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
{
/* If we have sacks and we haven't gotten an ack beyond high_rxt,
* remove sacked bytes delivered */
- tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered;
+ ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+ tc->snd_rxt_bytes -= rxt_delivered;
}
else
{
@@ -1154,6 +1151,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
u32 prev_snd_wnd, prev_snd_una;
u8 is_dack;
+ TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
+
/* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
{
@@ -1282,6 +1281,10 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
{
vec_add1 (new_list, tc->snd_sacks[i]);
}
+ else
+ {
+ clib_warning ("sack discarded");
+ }
}
ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
@@ -1358,16 +1361,18 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
stream_session_t *s0;
int rv;
+ ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
/* Pure ACK. Do nothing */
if (PREDICT_FALSE (data_len == 0))
{
return TCP_ERROR_PURE_ACK;
}
- /* Enqueue out-of-order data with absolute offset */
+ /* Enqueue out-of-order data with relative offset */
rv = stream_session_enqueue_data (&tc->connection, b,
- vnet_buffer (b)->tcp.seq_number,
- 0 /* queue event */ , 0);
+ vnet_buffer (b)->tcp.seq_number -
+ tc->rcv_nxt, 0 /* queue event */ , 0);
/* Nothing written */
if (rv)
@@ -1388,10 +1393,15 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
/* Get the newest segment from the fifo */
newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
- start = ooo_segment_offset (s0->server_rx_fifo, newest);
- end = ooo_segment_end_offset (s0->server_rx_fifo, newest);
+ if (newest)
+ {
+ start =
+ tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest);
+ end = start + ooo_segment_length (s0->server_rx_fifo, newest);
+ tcp_update_sack_list (tc, start, end);
- tcp_update_sack_list (tc, start, end);
+ ASSERT (seq_gt (start, tc->rcv_nxt));
+ }
}
return TCP_ERROR_ENQUEUED;
@@ -1411,7 +1421,7 @@ tcp_can_delack (tcp_connection_t * tc)
/* constrained to send ack */
|| (tc->flags & TCP_CONN_SNDACK) != 0
/* we're almost out of tx wnd */
- || tcp_available_snd_space (tc) < 2 * tc->snd_mss)
+ || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
return 0;
return 1;
@@ -1434,7 +1444,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
*next0 = TCP_NEXT_DROP;
/* Completely in the past (possible retransmit) */
- if (seq_lt (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
+ if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
goto done;
/* Chop off the bytes in the past */
@@ -1873,8 +1883,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (tcp_opts_wscale (&new_tc0->rcv_opts))
new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
- /* No scaling */
- new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
+ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
+ << new_tc0->snd_wscale;
new_tc0->snd_wl1 = seq0;
new_tc0->snd_wl2 = ack0;
@@ -1892,8 +1902,15 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Make sure las is initialized for the wnd computation */
new_tc0->rcv_las = new_tc0->rcv_nxt;
- /* Notify app that we have connection */
- stream_session_connect_notify (&new_tc0->connection, sst, 0);
+ /* Notify app that we have connection. If session layer can't
+ * allocate session send reset */
+ if (stream_session_connect_notify (&new_tc0->connection, sst,
+ 0))
+ {
+ tcp_connection_cleanup (new_tc0);
+ tcp_send_reset (b0, is_ip4);
+ goto drop;
+ }
stream_session_init_fifos_pointers (&new_tc0->connection,
new_tc0->irs + 1,
@@ -1907,7 +1924,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
new_tc0->state = TCP_STATE_SYN_RCVD;
/* Notify app that we have connection */
- stream_session_connect_notify (&new_tc0->connection, sst, 0);
+ if (stream_session_connect_notify
+ (&new_tc0->connection, sst, 0))
+ {
+ tcp_connection_cleanup (new_tc0);
+ tcp_send_reset (b0, is_ip4);
+ goto drop;
+ }
+
stream_session_init_fifos_pointers (&new_tc0->connection,
new_tc0->irs + 1,
new_tc0->iss + 1);
@@ -2508,8 +2532,8 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (tcp_opts_wscale (&child0->rcv_opts))
child0->snd_wscale = child0->rcv_opts.wscale;
- /* No scaling */
- child0->snd_wnd = clib_net_to_host_u16 (th0->window);
+ child0->snd_wnd = clib_net_to_host_u16 (th0->window)
+ << child0->snd_wscale;
child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
@@ -2892,6 +2916,9 @@ do { \
_(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
TCP_ERROR_NONE);
+ _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index c66250e41ea..c825e952c9b 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -18,7 +18,6 @@
void
newreno_congestion (tcp_connection_t * tc)
{
- tc->prev_ssthresh = tc->ssthresh;
tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
}
@@ -47,7 +46,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type)
{
if (ack_type == TCP_CC_DUPACK)
{
- tc->cwnd += tc->snd_mss;
+ if (!tcp_opts_sack_permitted (tc))
+ tc->cwnd += tc->snd_mss;
}
else if (ack_type == TCP_CC_PARTIALACK)
{
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 47c94e6daca..554a981d924 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1052,6 +1052,7 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc)
tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
+
tcp_recovery_on (tc);
}
@@ -1213,7 +1214,7 @@ tcp_timer_persist_handler (u32 index)
tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
/* Problem already solved or worse */
- if (tc->state == TCP_STATE_CLOSED
+ if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED
|| tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
return;
@@ -1505,10 +1506,7 @@ tcp46_output_inline (vlib_main_t * vm,
/* Stop DELACK timer and fix flags */
tc0->flags &= ~(TCP_CONN_SNDACK);
- if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK))
- {
- tcp_timer_reset (tc0, TCP_TIMER_DELACK);
- }
+ tcp_timer_reset (tc0, TCP_TIMER_DELACK);
/* If not retransmitting
* 1) update snd_una_max (SYN, SYNACK, FIN)
@@ -1630,7 +1628,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
tc = (tcp_connection_t *) tconn;
tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0);
- if (tc->rtt_ts == 0)
+ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
{
tc->rtt_ts = tcp_time_now ();
tc->rtt_seq = tc->snd_nxt;
diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c
index 3f8afa40e54..a461e3b8b7d 100644
--- a/src/vnet/tcp/tcp_test.c
+++ b/src/vnet/tcp/tcp_test.c
@@ -190,11 +190,18 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
TCP_TEST ((pool_elts (sb->holes) == 1),
"scoreboard has %d elements", pool_elts (sb->holes));
+ hole = scoreboard_first_hole (sb);
+ TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX
+ && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid");
+ TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d",
+ sb->last_bytes_delivered);
/*
* Add some more blocks and then remove all
*/
vec_reset_length (tc->rcv_opts.sacks);
+ tc->snd_una += sb->snd_una_adv;
+ tc->snd_una_max = 1900;
for (i = 0; i < 5; i++)
{
block.start = i * 100 + 1200;
@@ -242,6 +249,39 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
TCP_TEST ((sb->last_sacked_bytes == 0),
"last sacked bytes %d", sb->last_sacked_bytes);
+ /*
+ * Inject one block, ack it and overlap hole
+ */
+
+ tc->snd_una = 0;
+ tc->snd_una_max = 1000;
+ tc->snd_nxt = 1000;
+
+ block.start = 100;
+ block.end = 500;
+ vec_add1 (tc->rcv_opts.sacks, block);
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+
+ tcp_rcv_sacks (tc, 0);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb added [100, 500]:\n%U",
+ format_tcp_scoreboard, sb);
+
+ tcp_rcv_sacks (tc, 800);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 1),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+ TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
+ TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->last_sacked_bytes == 0),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+ TCP_TEST ((sb->last_bytes_delivered == 400),
+ "last bytes delivered %d", sb->last_bytes_delivered);
+
return 0;
}
@@ -571,7 +611,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
*/
for (i = 0; i < 3; i++)
{
- offset = (2 * i + 1) * sizeof (u32);
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
data = (u8 *) (test_data + (2 * i + 1));
if (i == 0)
{
@@ -600,7 +640,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
/*
* Try adding a completely overlapped segment
*/
- offset = 3 * sizeof (u32);
+ offset = 3 * sizeof (u32) - f->tail;
data = (u8 *) (test_data + 3);
rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
if (rv)
@@ -626,7 +666,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
*/
for (i = 3; i > 1; i--)
{
- offset = (2 * i + 0) * sizeof (u32);
+ offset = (2 * i + 0) * sizeof (u32) - f->tail;
data = (u8 *) (test_data + (2 * i + 0));
rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
if (verbose)
@@ -688,7 +728,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
for (i = 0; i < 4; i++)
{
- offset = (2 * i + 1) * sizeof (u32);
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
data = (u8 *) (test_data + (2 * i + 1));
rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
if (verbose)
@@ -701,7 +741,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
}
}
- rv = svm_fifo_enqueue_with_offset (f, 8, 21, data);
+ rv = svm_fifo_enqueue_with_offset (f, 8 - f->tail, 21, data);
TCP_TEST ((rv == 0), "ooo enqueued %u", rv);
TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
"number of ooo segments %u", svm_fifo_number_ooo_segments (f));
@@ -722,7 +762,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
for (i = 0; i < 4; i++)
{
- offset = (2 * i + 1) * sizeof (u32);
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
data = (u8 *) (test_data + (2 * i + 1));
rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
if (verbose)
@@ -735,7 +775,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
}
}
+ if (verbose)
+ vlib_cli_output (vm, "fifo after enqueue: %U", format_svm_fifo, f, 1);
+
rv = svm_fifo_enqueue_nowait (f, 29, data);
+ if (verbose)
+ vlib_cli_output (vm, "fifo after enqueueing 29: %U", format_svm_fifo, f,
+ 1);
TCP_TEST ((rv == 32), "ooo enqueued %u", rv);
TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0),
"number of ooo segments %u", svm_fifo_number_ooo_segments (f));
@@ -788,7 +834,8 @@ tcp_test_fifo2 (vlib_main_t * vm)
{
tp = vp + i;
data64 = tp->offset;
- svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64);
+ svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len,
+ (u8 *) & data64);
}
/* Expected result: one big fat chunk at offset 4 */
@@ -817,7 +864,7 @@ tcp_test_fifo2 (vlib_main_t * vm)
{
tp = &test_data[i];
data64 = tp->offset;
- rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len,
+ rv = svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len,
(u8 *) & data64);
if (rv)
{
@@ -991,8 +1038,9 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input)
for (i = !randomize; i < vec_len (generate); i++)
{
tp = generate + i;
- svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset,
- tp->len,
+ svm_fifo_enqueue_with_offset (f,
+ fifo_initial_offset + tp->offset -
+ f->tail, tp->len,
(u8 *) data_pattern + tp->offset);
}
@@ -1107,7 +1155,7 @@ tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input)
for (i = test_n_bytes - 1; i > 0; i--)
{
- rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i,
+ rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i - f->tail,
sizeof (u8), &test_data[i]);
if (verbose)
vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8));