Diffstat (limited to 'src/vnet/tcp')
-rw-r--r--  src/vnet/tcp/builtin_client.c        770
-rw-r--r--  src/vnet/tcp/builtin_client.h        121
-rw-r--r--  src/vnet/tcp/builtin_http_server.c   564
-rw-r--r--  src/vnet/tcp/builtin_proxy.c         601
-rw-r--r--  src/vnet/tcp/builtin_proxy.h         100
-rw-r--r--  src/vnet/tcp/builtin_server.c        455
-rw-r--r--  src/vnet/tcp/tcp.api                  42
-rw-r--r--  src/vnet/tcp/tcp.c                  1943
-rw-r--r--  src/vnet/tcp/tcp.h                   985
-rw-r--r--  src/vnet/tcp/tcp_api.c               119
-rwxr-xr-x  src/vnet/tcp/tcp_debug.h             761
-rw-r--r--  src/vnet/tcp/tcp_error.def            43
-rw-r--r--  src/vnet/tcp/tcp_format.c            137
-rw-r--r--  src/vnet/tcp/tcp_input.c            3215
-rw-r--r--  src/vnet/tcp/tcp_newreno.c           107
-rw-r--r--  src/vnet/tcp/tcp_output.c           2113
-rw-r--r--  src/vnet/tcp/tcp_packet.h            184
-rw-r--r--  src/vnet/tcp/tcp_pg.c                244
-rw-r--r--  src/vnet/tcp/tcp_syn_filter4.c       545
-rw-r--r--  src/vnet/tcp/tcp_test.c             1764
-rw-r--r--  src/vnet/tcp/tcp_timer.h              29
21 files changed, 14842 insertions, 0 deletions
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
new file mode 100644
index 00000000..527b3289
--- /dev/null
+++ b/src/vnet/tcp/builtin_client.c
@@ -0,0 +1,770 @@
+/*
+ * builtin_client.c - vpp built-in tcp client/connect code
+ *
+ * Copyright (c) 2017 by Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/plugin/plugin.h>
+#include <vnet/tcp/builtin_client.h>
+
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+#include <vpp/app/version.h>
+
+#define TCP_BUILTIN_CLIENT_DBG (0)
+
+static void
+signal_evt_to_cli_i (int *code)
+{
+ tclient_main_t *tm = &tclient_main;
+ ASSERT (vlib_get_thread_index () == 0);
+ vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, *code, 0);
+}
+
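+/*
+ * Signal the CLI process. Process events must be posted from the main
+ * thread, so worker threads bounce the signal through a main-thread RPC.
+ */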
+static void
+signal_evt_to_cli (int code)
+{
+ if (vlib_get_thread_index () != 0)
+ vl_api_rpc_call_main_thread (signal_evt_to_cli_i, (u8 *) & code,
+ sizeof (code));
+ else
+ signal_evt_to_cli_i (&code);
+}
+
+static void
+send_test_chunk (tclient_main_t * tm, session_t * s)
+{
+ u8 *test_data = tm->connect_test_data;
+ int test_buf_offset;
+ u32 bytes_this_chunk;
+ session_fifo_event_t evt;
+ static int serial_number = 0;
+ svm_fifo_t *txf;
+ int rv;
+
+ ASSERT (vec_len (test_data) > 0);
+
+ test_buf_offset = s->bytes_sent % vec_len (test_data);
+ bytes_this_chunk = vec_len (test_data) - test_buf_offset;
+
+ bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
+ ? bytes_this_chunk : s->bytes_to_send;
+
+ txf = s->server_tx_fifo;
+ rv = svm_fifo_enqueue_nowait (txf, bytes_this_chunk,
+ test_data + test_buf_offset);
+
+ /* If we managed to enqueue data... */
+ if (rv > 0)
+ {
+ /* Account for it... */
+ s->bytes_to_send -= rv;
+ s->bytes_sent += rv;
+
+ if (TCP_BUILTIN_CLIENT_DBG)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-enq: xfer %d bytes, sent %u remain %u",
+ .format_args = "i4i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 data[3];
+ } *ed;
+ ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+ ed->data[0] = rv;
+ ed->data[1] = s->bytes_sent;
+ ed->data[2] = s->bytes_to_send;
+ }
+
+ /* Poke the session layer */
+ if (svm_fifo_set_event (txf))
+ {
+ /* Fabricate TX event, send to vpp */
+ evt.fifo = txf;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ evt.event_id = serial_number++;
+
+ if (unix_shared_memory_queue_add
+ (tm->vpp_event_queue[txf->master_thread_index], (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("could not enqueue event");
+ }
+ }
+}
+
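+/*
+ * Drain the session's rx fifo. With test_bytes set, data is dequeued
+ * into a per-thread buffer and checked against the expected modulo-256
+ * pattern; otherwise it is simply dropped.
+ */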
+static void
+receive_test_chunk (tclient_main_t * tm, session_t * s)
+{
+ svm_fifo_t *rx_fifo = s->server_rx_fifo;
+ int n_read, test_bytes = 0;
+ u32 my_thread_index = vlib_get_thread_index ();
+
+ /* Allow enqueuing of new event */
+ // svm_fifo_unset_event (rx_fifo);
+
+ if (test_bytes)
+ {
+ n_read = svm_fifo_dequeue_nowait (rx_fifo,
+ vec_len (tm->rx_buf[my_thread_index]),
+ tm->rx_buf[my_thread_index]);
+ }
+ else
+ {
+ n_read = svm_fifo_max_dequeue (rx_fifo);
+ svm_fifo_dequeue_drop (rx_fifo, n_read);
+ }
+
+ if (n_read > 0)
+ {
+ if (TCP_BUILTIN_CLIENT_DBG)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "rx-deq: %d bytes",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 data[1];
+ } *ed;
+ ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+ ed->data[0] = n_read;
+ }
+
+ if (test_bytes)
+ {
+ int i;
+ for (i = 0; i < n_read; i++)
+ {
+ if (tm->rx_buf[my_thread_index][i]
+ != ((s->bytes_received + i) & 0xff))
+ {
+ clib_warning ("read %d error at byte %lld, 0x%x not 0x%x",
+ n_read, s->bytes_received + i,
+ tm->rx_buf[my_thread_index][i],
+ ((s->bytes_received + i) & 0xff));
+ }
+ }
+ }
+ s->bytes_to_receive -= n_read;
+ s->bytes_received += n_read;
+ }
+}
+
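+/*
+ * Per-thread input node: work on up to connections_per_batch sessions
+ * at a time, transmitting and receiving on each session until its byte
+ * counts reach zero, then disconnect it and signal the CLI process
+ * once all connections have finished.
+ */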
+static uword
+builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ tclient_main_t *tm = &tclient_main;
+ int my_thread_index = vlib_get_thread_index ();
+ session_t *sp;
+ int i;
+ int delete_session;
+ u32 *connection_indices;
+ u32 *connections_this_batch;
+ u32 nconnections_this_batch;
+
+ connection_indices = tm->connection_index_by_thread[my_thread_index];
+ connections_this_batch =
+ tm->connections_this_batch_by_thread[my_thread_index];
+
+ if ((tm->run_test == 0) ||
+ ((vec_len (connection_indices) == 0)
+ && vec_len (connections_this_batch) == 0))
+ return 0;
+
+ /* Grab another pile of connections */
+ if (PREDICT_FALSE (vec_len (connections_this_batch) == 0))
+ {
+ nconnections_this_batch =
+ clib_min (tm->connections_per_batch, vec_len (connection_indices));
+
+ ASSERT (nconnections_this_batch > 0);
+ vec_validate (connections_this_batch, nconnections_this_batch - 1);
+ clib_memcpy (connections_this_batch,
+ connection_indices + vec_len (connection_indices)
+ - nconnections_this_batch,
+ nconnections_this_batch * sizeof (u32));
+ _vec_len (connection_indices) -= nconnections_this_batch;
+ }
+
+ if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch
+ && tm->prev_conns == vec_len (connections_this_batch)))
+ {
+ tm->repeats++;
+ tm->prev_conns = vec_len (connections_this_batch);
+ if (tm->repeats == 500000)
+ {
+ clib_warning ("stuck clients");
+ }
+ }
+ else
+ {
+ tm->prev_conns = vec_len (connections_this_batch);
+ tm->repeats = 0;
+ }
+
+ for (i = 0; i < vec_len (connections_this_batch); i++)
+ {
+ delete_session = 1;
+
+ sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]);
+
+ if (sp->bytes_to_send > 0)
+ {
+ send_test_chunk (tm, sp);
+ delete_session = 0;
+ }
+ if (sp->bytes_to_receive > 0)
+ {
+ receive_test_chunk (tm, sp);
+ delete_session = 0;
+ }
+ if (PREDICT_FALSE (delete_session == 1))
+ {
+ u32 index, thread_index;
+ stream_session_t *s;
+
+ __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent);
+ __sync_fetch_and_add (&tm->rx_total, sp->bytes_received);
+
+ stream_session_parse_handle (sp->vpp_session_handle,
+ &index, &thread_index);
+ s = stream_session_get_if_valid (index, thread_index);
+
+ if (s)
+ {
+ vnet_disconnect_args_t _a, *a = &_a;
+ a->handle = stream_session_handle (s);
+ a->app_index = tm->app_index;
+ vnet_disconnect_session (a);
+
+ vec_delete (connections_this_batch, 1, i);
+ i--;
+ __sync_fetch_and_add (&tm->ready_connections, -1);
+ }
+ else
+ clib_warning ("session AWOL?");
+
+ /* Kick the debug CLI process */
+ if (tm->ready_connections == 0)
+ {
+ signal_evt_to_cli (2);
+ }
+ }
+ }
+
+ tm->connection_index_by_thread[my_thread_index] = connection_indices;
+ tm->connections_this_batch_by_thread[my_thread_index] =
+ connections_this_batch;
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (builtin_client_node) =
+{
+ .function = builtin_client_node_fn,
+ .name = "builtin-tcp-client",
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+};
+/* *INDENT-ON* */
+
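+/* Abuse VPP's input queue: create an internal API client for the app */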
+static int
+create_api_loopback (tclient_main_t * tm)
+{
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr;
+
+ shmem_hdr = am->shmem_hdr;
+ tm->vl_input_queue = shmem_hdr->vl_input_queue;
+ tm->my_client_index =
+ vl_api_memclnt_create_internal ("tcp_test_client", tm->vl_input_queue);
+ return 0;
+}
+
+static int
+tcp_test_clients_init (vlib_main_t * vm)
+{
+ tclient_main_t *tm = &tclient_main;
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ u32 num_threads;
+ int i;
+
+ if (create_api_loopback (tm))
+ return -1;
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+
+ /* Init test data. Big buffer */
+ vec_validate (tm->connect_test_data, 1024 * 1024 - 1);
+ for (i = 0; i < vec_len (tm->connect_test_data); i++)
+ tm->connect_test_data[i] = i & 0xff;
+
+ vec_validate (tm->rx_buf, num_threads - 1);
+ for (i = 0; i < num_threads; i++)
+ vec_validate (tm->rx_buf[i], vec_len (tm->connect_test_data) - 1);
+
+ tm->is_init = 1;
+
+ vec_validate (tm->connection_index_by_thread, vtm->n_vlib_mains);
+ vec_validate (tm->connections_this_batch_by_thread, vtm->n_vlib_mains);
+ vec_validate (tm->vpp_event_queue, vtm->n_vlib_mains);
+
+ return 0;
+}
+
+static int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+ stream_session_t * s, u8 is_fail)
+{
+ tclient_main_t *tm = &tclient_main;
+ session_t *session;
+ u32 session_index;
+ u8 thread_index = vlib_get_thread_index ();
+
+ if (is_fail)
+ {
+ clib_warning ("connection %d failed!", api_context);
+ signal_evt_to_cli (-1);
+ return 0;
+ }
+
+ ASSERT (s->thread_index == thread_index);
+
+ if (!tm->vpp_event_queue[thread_index])
+ tm->vpp_event_queue[thread_index] =
+ session_manager_get_vpp_event_queue (thread_index);
+
+ /*
+ * Setup session
+ */
+ clib_spinlock_lock_if_init (&tm->sessions_lock);
+ pool_get (tm->sessions, session);
+ clib_spinlock_unlock_if_init (&tm->sessions_lock);
+
+ memset (session, 0, sizeof (*session));
+ session_index = session - tm->sessions;
+ session->bytes_to_send = tm->bytes_to_send;
+ session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send;
+ session->server_rx_fifo = s->server_rx_fifo;
+ session->server_rx_fifo->client_session_index = session_index;
+ session->server_tx_fifo = s->server_tx_fifo;
+ session->server_tx_fifo->client_session_index = session_index;
+ session->vpp_session_handle = stream_session_handle (s);
+
+ vec_add1 (tm->connection_index_by_thread[thread_index], session_index);
+ __sync_fetch_and_add (&tm->ready_connections, 1);
+ if (tm->ready_connections == tm->expected_connections)
+ {
+ tm->run_test = 1;
+ /* Signal the CLI process that the action is starting... */
+ signal_evt_to_cli (1);
+ }
+
+ return 0;
+}
+
+static void
+builtin_session_reset_callback (stream_session_t * s)
+{
+ if (s->session_state == SESSION_STATE_READY)
+ clib_warning ("Reset active connection %U", format_stream_session, s, 2);
+ stream_session_cleanup (s);
+ return;
+}
+
+static int
+builtin_session_create_callback (stream_session_t * s)
+{
+ return 0;
+}
+
+static void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+ tclient_main_t *tm = &tclient_main;
+ vnet_disconnect_args_t _a, *a = &_a;
+ a->handle = stream_session_handle (s);
+ a->app_index = tm->app_index;
+ vnet_disconnect_session (a);
+ return;
+}
+
+static int
+builtin_server_rx_callback (stream_session_t * s)
+{
+ return 0;
+}
+
+/* *INDENT-OFF* */
+static session_cb_vft_t builtin_clients = {
+ .session_reset_callback = builtin_session_reset_callback,
+ .session_connected_callback = builtin_session_connected_callback,
+ .session_accept_callback = builtin_session_create_callback,
+ .session_disconnect_callback = builtin_session_disconnect_callback,
+ .builtin_server_rx_callback = builtin_server_rx_callback
+};
+/* *INDENT-ON* */
+
+static int
+attach_builtin_test_clients_app (void)
+{
+ tclient_main_t *tm = &tclient_main;
+ vnet_app_attach_args_t _a, *a = &_a;
+ u8 segment_name[128];
+ u32 segment_name_length, prealloc_fifos;
+ u64 options[16];
+
+ segment_name_length = ARRAY_LEN (segment_name);
+
+ memset (a, 0, sizeof (*a));
+ memset (options, 0, sizeof (options));
+
+ a->api_client_index = tm->my_client_index;
+ a->segment_name = segment_name;
+ a->segment_name_length = segment_name_length;
+ a->session_cb_vft = &builtin_clients;
+
+ prealloc_fifos = tm->prealloc_fifos ? tm->expected_connections : 1;
+
+ options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
+ options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32);
+ options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size;
+ options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size;
+ options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos;
+
+ options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
+ a->options = options;
+
+ if (vnet_application_attach (a))
+ return -1;
+
+ tm->app_index = a->app_index;
+ return 0;
+}
+
+static void *
+tclient_thread_fn (void *arg)
+{
+ return 0;
+}
+
+/** Start a transmit thread */
+int
+start_tx_pthread (tclient_main_t * tm)
+{
+ if (tm->client_thread_handle == 0)
+ {
+ int rv = pthread_create (&tm->client_thread_handle,
+ NULL /*attr */ ,
+ tclient_thread_fn, 0);
+ if (rv)
+ {
+ tm->client_thread_handle = 0;
+ return -1;
+ }
+ }
+ return 0;
+}
+
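+/*
+ * Fire off n_clients connect requests, with crude pacing: suspend
+ * briefly every fourth connect and throttle whenever more than 1000
+ * connects are outstanding.
+ */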
+void
+clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients)
+{
+ tclient_main_t *tm = &tclient_main;
+ vnet_connect_args_t _a, *a = &_a;
+ int i;
+ for (i = 0; i < n_clients; i++)
+ {
+ memset (a, 0, sizeof (*a));
+
+ a->uri = (char *) uri;
+ a->api_context = i;
+ a->app_index = tm->app_index;
+ a->mp = 0;
+ vnet_connect_uri (a);
+
+ /* Crude pacing for call setups */
+ if ((i % 4) == 0)
+ vlib_process_suspend (vm, 10e-6);
+ ASSERT (i + 1 >= tm->ready_connections);
+ while (i + 1 - tm->ready_connections > 1000)
+ {
+ vlib_process_suspend (vm, 100e-6);
+ }
+ }
+}
+
+static clib_error_t *
+test_tcp_clients_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ tclient_main_t *tm = &tclient_main;
+ vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+ uword *event_data = 0, event_type;
+ u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri;
+ u64 tmp, total_bytes;
+ f64 test_timeout = 20.0, syn_timeout = 20.0, delta;
+ f64 time_before_connects;
+ u32 n_clients = 1;
+ int preallocate_sessions = 0;
+ char *transfer_type;
+ int i;
+
+ tm->bytes_to_send = 8192;
+ tm->no_return = 0;
+ tm->fifo_size = 64 << 10;
+ tm->connections_per_batch = 1000;
+ tm->private_segment_count = 0;
+ tm->private_segment_size = 0;
+ tm->vlib_main = vm;
+ if (thread_main->n_vlib_mains > 1)
+ clib_spinlock_init (&tm->sessions_lock);
+ vec_free (tm->connect_uri);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "nclients %d", &n_clients))
+ ;
+ else if (unformat (input, "mbytes %lld", &tmp))
+ tm->bytes_to_send = tmp << 20;
+ else if (unformat (input, "gbytes %lld", &tmp))
+ tm->bytes_to_send = tmp << 30;
+ else if (unformat (input, "bytes %lld", &tm->bytes_to_send))
+ ;
+ else if (unformat (input, "uri %s", &tm->connect_uri))
+ ;
+ else if (unformat (input, "test-timeout %f", &test_timeout))
+ ;
+ else if (unformat (input, "syn-timeout %f", &syn_timeout))
+ ;
+ else if (unformat (input, "no-return"))
+ tm->no_return = 1;
+ else if (unformat (input, "fifo-size %d", &tm->fifo_size))
+ tm->fifo_size <<= 10;
+ else if (unformat (input, "private-segment-count %d",
+ &tm->private_segment_count))
+ ;
+ else if (unformat (input, "private-segment-size %U",
+ unformat_memory_size, &tmp))
+ {
+ if (tmp >= 0x100000000ULL)
+ return clib_error_return
+ (0, "private segment size %lld (%llu) too large", tmp, tmp);
+ tm->private_segment_size = tmp;
+ }
+ else if (unformat (input, "preallocate-fifos"))
+ tm->prealloc_fifos = 1;
+ else if (unformat (input, "preallocate-sessions"))
+ preallocate_sessions = 1;
+ else
+ if (unformat (input, "client-batch %d", &tm->connections_per_batch))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ /* Store cli process node index for signalling */
+ tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index;
+
+ if (tm->is_init == 0)
+ {
+ if (tcp_test_clients_init (vm))
+ return clib_error_return (0, "failed init");
+ }
+
+ tm->ready_connections = 0;
+ tm->expected_connections = n_clients;
+ tm->rx_total = 0;
+ tm->tx_total = 0;
+
+ uri = default_connect_uri;
+ if (tm->connect_uri)
+ uri = tm->connect_uri;
+
+#if TCP_BUILTIN_CLIENT_PTHREAD
+  start_tx_pthread (tm);
+#endif
+
+ vlib_worker_thread_barrier_sync (vm);
+ vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+ vlib_worker_thread_barrier_release (vm);
+
+ if (tm->test_client_attached == 0)
+ {
+ if (attach_builtin_test_clients_app ())
+ {
+ return clib_error_return (0, "app attach failed");
+ }
+ }
+ tm->test_client_attached = 1;
+
+ /* Turn on the builtin client input nodes */
+ for (i = 0; i < thread_main->n_vlib_mains; i++)
+ vlib_node_set_state (vlib_mains[i], builtin_client_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ if (preallocate_sessions)
+ {
+ session_t *sp __attribute__ ((unused));
+ for (i = 0; i < n_clients; i++)
+ pool_get (tm->sessions, sp);
+ for (i = 0; i < n_clients; i++)
+ pool_put_index (tm->sessions, i);
+ }
+
+ /* Fire off connect requests */
+ time_before_connects = vlib_time_now (vm);
+ clients_connect (vm, uri, n_clients);
+
+  /* Park until the sessions come up, or the syn-timeout expires... */
+ vlib_process_wait_for_event_or_clock (vm, syn_timeout);
+ event_type = vlib_process_get_events (vm, &event_data);
+ switch (event_type)
+ {
+ case ~0:
+ vlib_cli_output (vm, "Timeout with only %d sessions active...",
+ tm->ready_connections);
+ goto cleanup;
+
+ case 1:
+ delta = vlib_time_now (vm) - time_before_connects;
+
+ if (delta != 0.0)
+ {
+ vlib_cli_output
+ (vm, "%d three-way handshakes in %.2f seconds, %.2f/sec",
+ n_clients, delta, ((f64) n_clients) / delta);
+ }
+
+ tm->test_start_time = vlib_time_now (tm->vlib_main);
+ vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time);
+ break;
+
+ default:
+ vlib_cli_output (vm, "unexpected event(1): %d", event_type);
+ goto cleanup;
+ }
+
+ /* Now wait for the sessions to finish... */
+ vlib_process_wait_for_event_or_clock (vm, test_timeout);
+ event_type = vlib_process_get_events (vm, &event_data);
+ switch (event_type)
+ {
+ case ~0:
+ vlib_cli_output (vm, "Timeout with %d sessions still active...",
+ tm->ready_connections);
+ goto cleanup;
+
+ case 2:
+ tm->test_end_time = vlib_time_now (vm);
+ vlib_cli_output (vm, "Test finished at %.6f", tm->test_end_time);
+ break;
+
+ default:
+ vlib_cli_output (vm, "unexpected event(2): %d", event_type);
+ goto cleanup;
+ }
+
+ delta = tm->test_end_time - tm->test_start_time;
+
+ if (delta != 0.0)
+ {
+ total_bytes = (tm->no_return ? tm->tx_total : tm->rx_total);
+ transfer_type = tm->no_return ? "half-duplex" : "full-duplex";
+ vlib_cli_output (vm,
+ "%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds",
+ total_bytes, total_bytes / (1ULL << 20),
+ total_bytes / (1ULL << 30), delta);
+ vlib_cli_output (vm, "%.2f bytes/second %s",
+ ((f64) total_bytes) / (delta), transfer_type);
+ vlib_cli_output (vm, "%.4f gbit/second %s",
+ (((f64) total_bytes * 8.0) / delta / 1e9),
+ transfer_type);
+ }
+ else
+ vlib_cli_output (vm, "zero delta-t?");
+
+cleanup:
+ tm->run_test = 0;
+ for (i = 0; i < vec_len (tm->connection_index_by_thread); i++)
+ {
+ vec_reset_length (tm->connection_index_by_thread[i]);
+ vec_reset_length (tm->connections_this_batch_by_thread[i]);
+ }
+
+ pool_free (tm->sessions);
+
+ /* Detach the application, so we can use different fifo sizes next time */
+ if (tm->test_client_attached)
+ {
+ vnet_app_detach_args_t _da, *da = &_da;
+ int rv;
+
+ da->app_index = tm->app_index;
+
+ rv = vnet_application_detach (da);
+ if (rv)
+ vlib_cli_output (vm, "WARNING: app detach failed...");
+ tm->test_client_attached = 0;
+ tm->app_index = ~0;
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_clients_command, static) =
+{
+ .path = "test tcp clients",
+ .short_help = "test tcp clients [nclients %d] [[m|g]bytes <bytes>] "
+ "[test-timeout <time>][syn-timeout <time>][no-return][fifo-size <size>]"
+ "[private-segment-count <count>][private-segment-size <bytes>[m|g]]"
+ "[preallocate-fifos][preallocate-sessions][client-batch <batch-size>]"
+ "[uri <tcp://ip/port>]",
+ .function = test_tcp_clients_command_fn,
+ .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
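+/*
+ * Debug CLI example (hypothetical parameters, see short_help above):
+ *   test tcp clients nclients 100 bytes 8192 uri tcp://6.0.1.1/1234
+ */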
+
+clib_error_t *
+tcp_test_clients_main_init (vlib_main_t * vm)
+{
+ tclient_main_t *tm = &tclient_main;
+ tm->is_init = 0;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (tcp_test_clients_main_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h
new file mode 100644
index 00000000..06d239ef
--- /dev/null
+++ b/src/vnet/tcp/builtin_client.h
@@ -0,0 +1,121 @@
+
+/*
+ * tclient.h - skeleton vpp engine plug-in header file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_tclient_h__
+#define __included_tclient_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+#include <svm/svm_fifo_segment.h>
+#include <vnet/session/session.h>
+#include <vnet/session/application_interface.h>
+
+typedef struct
+{
+ u64 bytes_to_send;
+ u64 bytes_sent;
+ u64 bytes_to_receive;
+ u64 bytes_received;
+
+ svm_fifo_t *server_rx_fifo;
+ svm_fifo_t *server_tx_fifo;
+
+ u64 vpp_session_handle;
+} session_t;
+
+typedef struct
+{
+ /*
+ * Application setup parameters
+ */
+ unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */
+ unix_shared_memory_queue_t **vpp_event_queue;
+
+ u32 cli_node_index; /**< cli process node index */
+ u32 my_client_index; /**< loopback API client handle */
+ u32 app_index; /**< app index after attach */
+
+ /*
+ * Configuration params
+ */
+ u8 *connect_uri; /**< URI for slave's connect */
+ u64 bytes_to_send; /**< Bytes to send */
+ u32 configured_segment_size;
+ u32 fifo_size;
+ u32 expected_connections; /**< Number of clients/connections */
+ u32 connections_per_batch; /**< Connections to rx/tx at once */
+ u32 private_segment_count; /**< Number of private fifo segs */
+ u32 private_segment_size; /**< size of private fifo segs */
+
+ /*
+ * Test state variables
+ */
+ session_t *sessions; /**< Session pool, shared */
+ clib_spinlock_t sessions_lock;
+ u8 **rx_buf; /**< intermediate rx buffers */
+ u8 *connect_test_data; /**< Pre-computed test data */
+ u32 **connection_index_by_thread;
+ u32 **connections_this_batch_by_thread; /**< active connection batch */
+ pthread_t client_thread_handle;
+
+ volatile u32 ready_connections;
+ volatile u32 finished_connections;
+ volatile u64 rx_total;
+ volatile u64 tx_total;
+ volatile int run_test; /**< Signal start of test */
+
+ f64 test_start_time;
+ f64 test_end_time;
+ u32 prev_conns;
+ u32 repeats;
+ /*
+ * Flags
+ */
+ u8 is_init;
+ u8 test_client_attached;
+ u8 no_return;
+ u8 test_return_packets;
+ int i_am_master;
+ int drop_packets; /**< drop all packets */
+ u8 prealloc_fifos; /**< Request fifo preallocation */
+
+ /*
+ * Convenience
+ */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ ethernet_main_t *ethernet_main;
+} tclient_main_t;
+
+tclient_main_t tclient_main;
+
+vlib_node_registration_t tclient_node;
+
+#endif /* __included_tclient_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/builtin_http_server.c b/src/vnet/tcp/builtin_http_server.c
new file mode 100644
index 00000000..9ba19ce9
--- /dev/null
+++ b/src/vnet/tcp/builtin_http_server.c
@@ -0,0 +1,564 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+
+typedef enum
+{
+ EVENT_WAKEUP = 1,
+} http_process_event_t;
+
+typedef struct
+{
+ u64 session_handle;
+ u64 node_index;
+ u8 *data;
+} builtin_http_server_args;
+
+typedef struct
+{
+ u8 *rx_buf;
+ unix_shared_memory_queue_t **vpp_queue;
+ u64 byte_index;
+
+ uword *handler_by_get_request;
+
+ u32 *free_http_cli_process_node_indices;
+
+  /* Server's event queue */
+ unix_shared_memory_queue_t *vl_input_queue;
+
+ /* API client handle */
+ u32 my_client_index;
+
+ u32 app_index;
+
+  /* process node index for event scheduling */
+ u32 node_index;
+ vlib_main_t *vlib_main;
+} http_server_main_t;
+
+http_server_main_t http_server_main;
+
+static void
+free_http_process (builtin_http_server_args * args)
+{
+ vlib_node_runtime_t *rt;
+ vlib_main_t *vm = &vlib_global_main;
+ http_server_main_t *hsm = &http_server_main;
+ vlib_node_t *n;
+ u32 node_index;
+ builtin_http_server_args **save_args;
+
+ node_index = args->node_index;
+ ASSERT (node_index != 0);
+
+ n = vlib_get_node (vm, node_index);
+ rt = vlib_node_get_runtime (vm, n->index);
+ save_args = vlib_node_get_runtime_data (vm, n->index);
+
+ /* Reset process session pointer */
+ clib_mem_free (*save_args);
+ *save_args = 0;
+
+ /* Turn off the process node */
+ vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED);
+
+ /* add node index to the freelist */
+ vec_add1 (hsm->free_http_cli_process_node_indices, node_index);
+}
+
+static const char
+ *http_response = "HTTP/1.1 200 OK\r\n"
+ "Content-Type: text/html\r\n"
+ "Expires: Mon, 11 Jan 1970 10:10:10 GMT\r\n"
+ "Connection: close\r\n"
+ "Pragma: no-cache\r\n" "Content-Length: %d\r\n\r\n%s";
+
+static const char
+ *http_error_template = "HTTP/1.1 %s\r\n"
+ "Content-Type: text/html\r\n"
+ "Expires: Mon, 11 Jan 1970 10:10:10 GMT\r\n"
+ "Connection: close\r\n" "Pragma: no-cache\r\n" "Content-Length: 0\r\n\r\n";
+
+/* Header, including incantation to suppress favicon.ico requests */
+static const char
+ *html_header_template = "<html><head><title>%v</title>"
+ "</head><link rel=\"icon\" href=\"data:,\"><body><pre>";
+
+static const char *html_footer = "</pre></body></html>\r\n";
+
+static void
+http_cli_output (uword arg, u8 * buffer, uword buffer_bytes)
+{
+ u8 **output_vecp = (u8 **) arg;
+ u8 *output_vec;
+ u32 offset;
+
+ output_vec = *output_vecp;
+
+ offset = vec_len (output_vec);
+ vec_validate (output_vec, offset + buffer_bytes - 1);
+ clib_memcpy (output_vec + offset, buffer, buffer_bytes);
+
+ *output_vecp = output_vec;
+}
+
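+/*
+ * Enqueue a reply into the session tx fifo, backing off exponentially
+ * (10 ms up to 1 s) while the fifo is full and giving up after 10
+ * seconds without progress.
+ */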
+void
+send_data (builtin_http_server_args * args, u8 * data)
+{
+ session_fifo_event_t evt;
+ u32 offset, bytes_to_send;
+ f64 delay = 10e-3;
+ http_server_main_t *hsm = &http_server_main;
+ vlib_main_t *vm = hsm->vlib_main;
+ f64 last_sent_timer = vlib_time_now (vm);
+ stream_session_t *s;
+
+ s = stream_session_get_from_handle (args->session_handle);
+ ASSERT (s);
+ bytes_to_send = vec_len (data);
+ offset = 0;
+
+ while (bytes_to_send > 0)
+ {
+ int actual_transfer;
+
+ actual_transfer = svm_fifo_enqueue_nowait
+ (s->server_tx_fifo, bytes_to_send, data + offset);
+
+ /* Made any progress? */
+ if (actual_transfer <= 0)
+ {
+ vlib_process_suspend (vm, delay);
+ /* 10s deadman timer */
+ if (vlib_time_now (vm) > last_sent_timer + 10.0)
+ {
+ /* $$$$ FC: reset transport session here? */
+ break;
+ }
+ /* Exponential backoff, within reason */
+ if (delay < 1.0)
+ delay = delay * 2.0;
+ }
+ else
+ {
+ last_sent_timer = vlib_time_now (vm);
+ offset += actual_transfer;
+ bytes_to_send -= actual_transfer;
+
+ if (svm_fifo_set_event (s->server_tx_fifo))
+ {
+ /* Fabricate TX event, send to vpp */
+ evt.fifo = s->server_tx_fifo;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ evt.event_id = 0;
+
+ unix_shared_memory_queue_add (hsm->vpp_queue[s->thread_index],
+ (u8 *) & evt,
+ 0 /* do wait for mutex */ );
+ }
+ delay = 10e-3;
+ }
+ }
+}
+
+static void
+send_error (builtin_http_server_args * args, char *str)
+{
+ u8 *data;
+
+ data = format (0, http_error_template, str);
+ send_data (args, data);
+ vec_free (data);
+}
+
+static uword
+http_cli_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+ http_server_main_t *hsm = &http_server_main;
+ u8 *request = 0, *reply = 0;
+ builtin_http_server_args **save_args;
+ builtin_http_server_args *args;
+ unformat_input_t input;
+ int i;
+ u8 *http = 0, *html = 0;
+
+ save_args = vlib_node_get_runtime_data (hsm->vlib_main, rt->node_index);
+ args = *save_args;
+
+ request = (u8 *) (void *) (args->data);
+ if (vec_len (request) < 7)
+ {
+ send_error (args, "400 Bad Request");
+ goto out;
+ }
+
+ for (i = 0; i < vec_len (request) - 4; i++)
+ {
+ if (request[i] == 'G' &&
+ request[i + 1] == 'E' &&
+ request[i + 2] == 'T' && request[i + 3] == ' ')
+ goto found;
+ }
+bad_request:
+ send_error (args, "400 Bad Request");
+ goto out;
+
+found:
+  /* Lose the prefix up to and including "GET /" */
+ vec_delete (request, i + 5, 0);
+
+ /* Replace slashes with spaces, stop at the end of the path */
+ i = 0;
+ while (1)
+ {
+ if (request[i] == '/')
+ request[i] = ' ';
+ else if (request[i] == ' ')
+ {
+ /* vlib_cli_input is vector-based, no need for a NULL */
+ _vec_len (request) = i;
+ break;
+ }
+ i++;
+ /* Should never happen */
+ if (i == vec_len (request))
+ goto bad_request;
+ }
+
+ /* Generate the html header */
+ html = format (0, html_header_template, request /* title */ );
+
+ /* Run the command */
+ unformat_init_vector (&input, request);
+ vlib_cli_input (vm, &input, http_cli_output, (uword) & reply);
+ unformat_free (&input);
+ request = 0;
+
+ /* Generate the html page */
+ html = format (html, "%v", reply);
+ html = format (html, html_footer);
+ /* And the http reply */
+ http = format (0, http_response, vec_len (html), html);
+
+ /* Send it */
+ send_data (args, http);
+
+out:
+ /* Cleanup */
+ vec_free (request);
+ vec_free (reply);
+ vec_free (html);
+ vec_free (http);
+
+ free_http_process (args);
+ return (0);
+}
+
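+/*
+ * Run the request on a (possibly recycled) vlib process node. Process
+ * nodes are never freed: finished ones are parked on a freelist and
+ * reused before any new node is registered.
+ */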
+static void
+alloc_http_process (builtin_http_server_args * args)
+{
+ char *name;
+ vlib_node_t *n;
+ http_server_main_t *hsm = &http_server_main;
+ vlib_main_t *vm = hsm->vlib_main;
+ uword l = vec_len (hsm->free_http_cli_process_node_indices);
+ builtin_http_server_args **save_args;
+
+ if (vec_len (hsm->free_http_cli_process_node_indices) > 0)
+ {
+ n = vlib_get_node (vm, hsm->free_http_cli_process_node_indices[l - 1]);
+ vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING);
+ _vec_len (hsm->free_http_cli_process_node_indices) = l - 1;
+ }
+ else
+ {
+ static vlib_node_registration_t r = {
+ .function = http_cli_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .process_log2_n_stack_bytes = 16,
+ .runtime_data_bytes = sizeof (void *),
+ };
+
+ name = (char *) format (0, "http-cli-%d", l);
+ r.name = name;
+ vlib_register_node (vm, &r);
+ vec_free (name);
+
+ n = vlib_get_node (vm, r.index);
+ }
+
+ /* Save the node index in the args. It won't be zero. */
+ args->node_index = n->index;
+
+ /* Save the args (pointer) in the node runtime */
+ save_args = vlib_node_get_runtime_data (vm, n->index);
+ *save_args = args;
+
+ vlib_start_process (vm, n->runtime_index);
+}
+
+static void
+alloc_http_process_callback (void *cb_args)
+{
+ alloc_http_process ((builtin_http_server_args *) cb_args);
+}
+
+static int
+http_server_rx_callback (stream_session_t * s)
+{
+ u32 max_dequeue;
+ int actual_transfer;
+ http_server_main_t *hsm = &http_server_main;
+ svm_fifo_t *rx_fifo;
+ builtin_http_server_args *args;
+
+ rx_fifo = s->server_rx_fifo;
+ max_dequeue = svm_fifo_max_dequeue (rx_fifo);
+ svm_fifo_unset_event (rx_fifo);
+ if (PREDICT_FALSE (max_dequeue == 0))
+ return 0;
+
+ vec_validate (hsm->rx_buf, max_dequeue - 1);
+ _vec_len (hsm->rx_buf) = max_dequeue;
+
+ actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_dequeue,
+ hsm->rx_buf);
+ ASSERT (actual_transfer > 0);
+ _vec_len (hsm->rx_buf) = actual_transfer;
+
+ /* send the command to a new/recycled vlib process */
+ args = clib_mem_alloc (sizeof (*args));
+ args->data = vec_dup (hsm->rx_buf);
+ args->session_handle = stream_session_handle (s);
+
+ /* Send an RPC request via the thread-0 input node */
+ if (vlib_get_thread_index () != 0)
+ {
+ session_fifo_event_t evt;
+ evt.rpc_args.fp = alloc_http_process_callback;
+ evt.rpc_args.arg = args;
+ evt.event_type = FIFO_EVENT_RPC;
+ unix_shared_memory_queue_add
+ (session_manager_get_vpp_event_queue (0 /* main thread */ ),
+ (u8 *) & evt, 0 /* do wait for mutex */ );
+ }
+ else
+ alloc_http_process (args);
+ return 0;
+}
+
+static int
+builtin_session_accept_callback (stream_session_t * s)
+{
+ http_server_main_t *bsm = &http_server_main;
+
+ bsm->vpp_queue[s->thread_index] =
+ session_manager_get_vpp_event_queue (s->thread_index);
+ s->session_state = SESSION_STATE_READY;
+ bsm->byte_index = 0;
+ return 0;
+}
+
+static void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+ http_server_main_t *bsm = &http_server_main;
+ vnet_disconnect_args_t _a, *a = &_a;
+
+ a->handle = stream_session_handle (s);
+ a->app_index = bsm->app_index;
+ vnet_disconnect_session (a);
+}
+
+static void
+builtin_session_reset_callback (stream_session_t * s)
+{
+ clib_warning ("called.. ");
+
+ stream_session_cleanup (s);
+}
+
+static int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+ stream_session_t * s, u8 is_fail)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static int
+builtin_add_segment_callback (u32 client_index,
+ const u8 * seg_name, u32 seg_size)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static int
+builtin_redirect_connect_callback (u32 client_index, void *mp)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+ .session_accept_callback = builtin_session_accept_callback,
+ .session_disconnect_callback = builtin_session_disconnect_callback,
+ .session_connected_callback = builtin_session_connected_callback,
+ .add_segment_callback = builtin_add_segment_callback,
+ .redirect_connect_callback = builtin_redirect_connect_callback,
+ .builtin_server_rx_callback = http_server_rx_callback,
+ .session_reset_callback = builtin_session_reset_callback
+};
+
+/* Abuse VPP's input queue */
+static int
+create_api_loopback (vlib_main_t * vm)
+{
+ http_server_main_t *hsm = &http_server_main;
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr;
+
+ shmem_hdr = am->shmem_hdr;
+ hsm->vl_input_queue = shmem_hdr->vl_input_queue;
+ hsm->my_client_index =
+ vl_api_memclnt_create_internal ("tcp_test_client", hsm->vl_input_queue);
+ return 0;
+}
+
+static int
+server_attach ()
+{
+ http_server_main_t *hsm = &http_server_main;
+ u8 segment_name[128];
+ u64 options[SESSION_OPTIONS_N_OPTIONS];
+ vnet_app_attach_args_t _a, *a = &_a;
+
+ memset (a, 0, sizeof (*a));
+ memset (options, 0, sizeof (options));
+
+ a->api_client_index = hsm->my_client_index;
+ a->session_cb_vft = &builtin_session_cb_vft;
+ a->options = options;
+ a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
+ a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 8 << 10;
+ a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 32 << 10;
+ a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+ a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
+ a->segment_name = segment_name;
+ a->segment_name_length = ARRAY_LEN (segment_name);
+
+ if (vnet_application_attach (a))
+ {
+ clib_warning ("failed to attach server");
+ return -1;
+ }
+ hsm->app_index = a->app_index;
+ return 0;
+}
+
+static int
+server_listen ()
+{
+ http_server_main_t *hsm = &http_server_main;
+ vnet_bind_args_t _a, *a = &_a;
+ memset (a, 0, sizeof (*a));
+ a->app_index = hsm->app_index;
+ a->uri = "tcp://0.0.0.0/80";
+ return vnet_bind_uri (a);
+}
+
+static int
+server_create (vlib_main_t * vm)
+{
+ http_server_main_t *hsm = &http_server_main;
+ u32 num_threads;
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+
+ ASSERT (hsm->my_client_index == (u32) ~ 0);
+ if (create_api_loopback (vm))
+ return -1;
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+ vec_validate (http_server_main.vpp_queue, num_threads - 1);
+
+ if (server_attach ())
+ {
+ clib_warning ("failed to attach server");
+ return -1;
+ }
+ if (server_listen ())
+ {
+ clib_warning ("failed to start listening");
+ return -1;
+ }
+ return 0;
+}
+
+static clib_error_t *
+server_create_command_fn (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ http_server_main_t *hsm = &http_server_main;
+ int rv;
+
+ if (hsm->my_client_index != (u32) ~ 0)
+ return clib_error_return (0, "test http server is already running");
+
+ vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+ rv = server_create (vm);
+ switch (rv)
+ {
+ case 0:
+ break;
+ default:
+ return clib_error_return (0, "server_create returned %d", rv);
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+ .path = "test http server",
+ .short_help = "test http server",
+ .function = server_create_command_fn,
+};
+/* *INDENT-ON* */
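+/*
+ * Debug CLI example: "test http server" binds tcp://0.0.0.0/80 and maps
+ * GET requests onto debug CLI commands, e.g. GET /show/version runs
+ * "show version" and returns its output as an html page.
+ */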
+
+static clib_error_t *
+builtin_http_server_main_init (vlib_main_t * vm)
+{
+ http_server_main_t *hsm = &http_server_main;
+ hsm->my_client_index = ~0;
+ hsm->vlib_main = vm;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_http_server_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/builtin_proxy.c b/src/vnet/tcp/builtin_proxy.c
new file mode 100644
index 00000000..91377e76
--- /dev/null
+++ b/src/vnet/tcp/builtin_proxy.c
@@ -0,0 +1,601 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+#include <vnet/tcp/builtin_proxy.h>
+
+builtin_proxy_main_t builtin_proxy_main;
+
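+/*
+ * Tear down a proxy session pair: look up the proxy_session_t from
+ * whichever side's handle we were given, return it to the pool, and
+ * disconnect both the server and active-open sessions if they exist.
+ */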
+static void
+delete_proxy_session (stream_session_t * s, int is_active_open)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ proxy_session_t *ps = 0;
+ vnet_disconnect_args_t _a, *a = &_a;
+ stream_session_t *active_open_session = 0;
+ stream_session_t *server_session = 0;
+ uword *p;
+ u64 handle;
+
+ handle = stream_session_handle (s);
+
+ clib_spinlock_lock_if_init (&bpm->sessions_lock);
+ if (is_active_open)
+ {
+ active_open_session = s;
+
+ p = hash_get (bpm->proxy_session_by_active_open_handle, handle);
+ if (p == 0)
+ {
+ clib_warning ("proxy session for %s handle %lld (%llx) AWOL",
+ is_active_open ? "active open" : "server",
+ handle, handle);
+ }
+ else
+ {
+ ps = pool_elt_at_index (bpm->sessions, p[0]);
+ if (ps->vpp_server_handle != ~0)
+ server_session = stream_session_get_from_handle
+ (ps->vpp_server_handle);
+ else
+ server_session = 0;
+ }
+ }
+ else
+ {
+ server_session = s;
+
+ p = hash_get (bpm->proxy_session_by_server_handle, handle);
+ if (p == 0)
+ {
+ clib_warning ("proxy session for %s handle %lld (%llx) AWOL",
+ is_active_open ? "active open" : "server",
+ handle, handle);
+ }
+ else
+ {
+ ps = pool_elt_at_index (bpm->sessions, p[0]);
+	  if (ps->vpp_active_open_handle != ~0)
+	    active_open_session = stream_session_get_from_handle
+	      (ps->vpp_active_open_handle);
+ else
+ active_open_session = 0;
+ }
+ }
+
+ if (ps)
+ {
+ if (CLIB_DEBUG > 0)
+ memset (ps, 0xFE, sizeof (*ps));
+ pool_put (bpm->sessions, ps);
+ }
+
+ clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+ if (active_open_session)
+ {
+ a->handle = stream_session_handle (active_open_session);
+ a->app_index = bpm->active_open_app_index;
+ hash_unset (bpm->proxy_session_by_active_open_handle,
+ stream_session_handle (active_open_session));
+ vnet_disconnect_session (a);
+ }
+
+ if (server_session)
+ {
+ a->handle = stream_session_handle (server_session);
+ a->app_index = bpm->server_app_index;
+ hash_unset (bpm->proxy_session_by_server_handle,
+ stream_session_handle (server_session));
+ vnet_disconnect_session (a);
+ }
+}
+
+static int
+server_accept_callback (stream_session_t * s)
+{
+  s->session_state = SESSION_STATE_READY;
+
+ return 0;
+}
+
+static void
+server_disconnect_callback (stream_session_t * s)
+{
+ delete_proxy_session (s, 0 /* is_active_open */ );
+}
+
+static void
+server_reset_callback (stream_session_t * s)
+{
+ clib_warning ("Reset session %U", format_stream_session, s, 2);
+ delete_proxy_session (s, 0 /* is_active_open */ );
+}
+
+static int
+server_connected_callback (u32 app_index, u32 api_context,
+ stream_session_t * s, u8 is_fail)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static int
+server_add_segment_callback (u32 client_index,
+ const u8 * seg_name, u32 seg_size)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static int
+server_redirect_connect_callback (u32 client_index, void *mp)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+static int
+server_rx_callback (stream_session_t * s)
+{
+ u32 max_dequeue;
+ int actual_transfer __attribute__ ((unused));
+ svm_fifo_t *tx_fifo, *rx_fifo;
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ u32 thread_index = vlib_get_thread_index ();
+ vnet_connect_args_t _a, *a = &_a;
+ proxy_session_t *ps;
+ int proxy_index;
+ uword *p;
+ svm_fifo_t *active_open_tx_fifo;
+ session_fifo_event_t evt;
+
+ ASSERT (s->thread_index == thread_index);
+
+ clib_spinlock_lock_if_init (&bpm->sessions_lock);
+ p =
+ hash_get (bpm->proxy_session_by_server_handle, stream_session_handle (s));
+
+ if (PREDICT_TRUE (p != 0))
+ {
+ clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+ active_open_tx_fifo = s->server_rx_fifo;
+
+ /*
+ * Send event for active open tx fifo
+ */
+ if (svm_fifo_set_event (active_open_tx_fifo))
+ {
+ evt.fifo = active_open_tx_fifo;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ if (unix_shared_memory_queue_add
+ (bpm->active_open_event_queue[thread_index], (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("failed to enqueue tx evt");
+ }
+ }
+ else
+    {
+      clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+      rx_fifo = s->server_rx_fifo;
+ tx_fifo = s->server_tx_fifo;
+
+ ASSERT (rx_fifo->master_thread_index == thread_index);
+ ASSERT (tx_fifo->master_thread_index == thread_index);
+
+ max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
+
+ if (PREDICT_FALSE (max_dequeue == 0))
+ return 0;
+
+ actual_transfer = svm_fifo_peek (rx_fifo, 0 /* relative_offset */ ,
+ max_dequeue,
+ bpm->rx_buf[thread_index]);
+
+ /* $$$ your message in this space: parse url, etc. */
+
+ memset (a, 0, sizeof (*a));
+
+ clib_spinlock_lock_if_init (&bpm->sessions_lock);
+ pool_get (bpm->sessions, ps);
+ memset (ps, 0, sizeof (*ps));
+ ps->server_rx_fifo = rx_fifo;
+ ps->server_tx_fifo = tx_fifo;
+ ps->vpp_server_handle = stream_session_handle (s);
+
+ proxy_index = ps - bpm->sessions;
+
+ hash_set (bpm->proxy_session_by_server_handle, ps->vpp_server_handle,
+ proxy_index);
+
+ clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+ a->uri = "tcp://6.0.2.2/23";
+ a->api_context = proxy_index;
+ a->app_index = bpm->active_open_app_index;
+ a->mp = 0;
+ vnet_connect_uri (a);
+ }
+
+ return 0;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+ .session_accept_callback = server_accept_callback,
+ .session_disconnect_callback = server_disconnect_callback,
+ .session_connected_callback = server_connected_callback,
+ .add_segment_callback = server_add_segment_callback,
+ .redirect_connect_callback = server_redirect_connect_callback,
+ .builtin_server_rx_callback = server_rx_callback,
+ .session_reset_callback = server_reset_callback
+};
+
+static int
+active_open_connected_callback (u32 app_index, u32 opaque,
+ stream_session_t * s, u8 is_fail)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ proxy_session_t *ps;
+ u8 thread_index = vlib_get_thread_index ();
+ session_fifo_event_t evt;
+
+ if (is_fail)
+ {
+ clib_warning ("connection %d failed!", opaque);
+ return 0;
+ }
+
+ /*
+ * Setup proxy session handle.
+ */
+ clib_spinlock_lock_if_init (&bpm->sessions_lock);
+
+ ps = pool_elt_at_index (bpm->sessions, opaque);
+ ps->vpp_active_open_handle = stream_session_handle (s);
+
+ s->server_tx_fifo = ps->server_rx_fifo;
+ s->server_rx_fifo = ps->server_tx_fifo;
+
+ /*
+ * Reset the active-open tx-fifo master indices so the active-open session
+ * will receive data, etc.
+ */
+ s->server_tx_fifo->master_session_index = s->session_index;
+ s->server_tx_fifo->master_thread_index = s->thread_index;
+
+ /*
+ * Account for the active-open session's use of the fifos
+ * so they won't disappear until the last session which uses
+ * them disappears
+ */
+ s->server_tx_fifo->refcnt++;
+ s->server_rx_fifo->refcnt++;
+
+ hash_set (bpm->proxy_session_by_active_open_handle,
+ ps->vpp_active_open_handle, opaque);
+
+ clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+ /*
+ * Send event for active open tx fifo
+ */
+ if (svm_fifo_set_event (s->server_tx_fifo))
+ {
+ evt.fifo = s->server_tx_fifo;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ if (unix_shared_memory_queue_add
+ (bpm->active_open_event_queue[thread_index], (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("failed to enqueue tx evt");
+ }
+
+ return 0;
+}
+
+static void
+active_open_reset_callback (stream_session_t * s)
+{
+ delete_proxy_session (s, 1 /* is_active_open */ );
+}
+
+static int
+active_open_create_callback (stream_session_t * s)
+{
+ return 0;
+}
+
+static void
+active_open_disconnect_callback (stream_session_t * s)
+{
+ delete_proxy_session (s, 1 /* is_active_open */ );
+}
+
+static int
+active_open_rx_callback (stream_session_t * s)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ session_fifo_event_t evt;
+ svm_fifo_t *server_rx_fifo;
+ u32 thread_index = vlib_get_thread_index ();
+
+ server_rx_fifo = s->server_rx_fifo;
+
+ /*
+ * Send event for server tx fifo
+ */
+ if (svm_fifo_set_event (server_rx_fifo))
+ {
+ evt.fifo = server_rx_fifo;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ if (unix_shared_memory_queue_add
+ (bpm->server_event_queue[thread_index], (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("failed to enqueue server rx evt");
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+static session_cb_vft_t builtin_clients = {
+ .session_reset_callback = active_open_reset_callback,
+ .session_connected_callback = active_open_connected_callback,
+ .session_accept_callback = active_open_create_callback,
+ .session_disconnect_callback = active_open_disconnect_callback,
+ .builtin_server_rx_callback = active_open_rx_callback
+};
+/* *INDENT-ON* */
+
+
+static void
+create_api_loopbacks (vlib_main_t * vm)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr;
+
+ shmem_hdr = am->shmem_hdr;
+ bpm->vl_input_queue = shmem_hdr->vl_input_queue;
+ bpm->server_client_index =
+ vl_api_memclnt_create_internal ("proxy_server", bpm->vl_input_queue);
+ bpm->active_open_client_index =
+ vl_api_memclnt_create_internal ("proxy_active_open", bpm->vl_input_queue);
+}
+
+static int
+server_attach ()
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ u8 segment_name[128];
+ u64 options[SESSION_OPTIONS_N_OPTIONS];
+ vnet_app_attach_args_t _a, *a = &_a;
+
+ memset (a, 0, sizeof (*a));
+ memset (options, 0, sizeof (options));
+
+ a->api_client_index = bpm->server_client_index;
+ a->session_cb_vft = &builtin_session_cb_vft;
+ a->options = options;
+ a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+ a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bpm->fifo_size;
+ a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bpm->fifo_size;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bpm->private_segment_count;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bpm->private_segment_size;
+ a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+ bpm->prealloc_fifos ? bpm->prealloc_fifos : 1;
+
+ a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
+ a->segment_name = segment_name;
+ a->segment_name_length = ARRAY_LEN (segment_name);
+
+ if (vnet_application_attach (a))
+ {
+ clib_warning ("failed to attach server");
+ return -1;
+ }
+ bpm->server_app_index = a->app_index;
+
+ return 0;
+}
+
+static int
+active_open_attach (void)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ vnet_app_attach_args_t _a, *a = &_a;
+ u8 segment_name[128];
+ u32 segment_name_length;
+ u64 options[16];
+
+ segment_name_length = ARRAY_LEN (segment_name);
+
+ memset (a, 0, sizeof (*a));
+ memset (options, 0, sizeof (options));
+
+ a->api_client_index = bpm->active_open_client_index;
+ a->segment_name = segment_name;
+ a->segment_name_length = segment_name_length;
+ a->session_cb_vft = &builtin_clients;
+
+ options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
+ options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+ options[SESSION_OPTIONS_RX_FIFO_SIZE] = bpm->fifo_size;
+ options[SESSION_OPTIONS_TX_FIFO_SIZE] = bpm->fifo_size;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bpm->private_segment_count;
+ options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bpm->private_segment_size;
+ options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+ bpm->prealloc_fifos ? bpm->prealloc_fifos : 1;
+
+ options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP
+ | APP_OPTIONS_FLAGS_IS_PROXY;
+
+ a->options = options;
+
+ if (vnet_application_attach (a))
+ return -1;
+
+ bpm->active_open_app_index = a->app_index;
+
+ return 0;
+}
+
+static int
+server_listen ()
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ vnet_bind_args_t _a, *a = &_a;
+ memset (a, 0, sizeof (*a));
+ a->app_index = bpm->server_app_index;
+ a->uri = "tcp://0.0.0.0/23";
+ return vnet_bind_uri (a);
+}
+
+static int
+server_create (vlib_main_t * vm)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ u32 num_threads;
+ int i;
+
+ if (bpm->server_client_index == (u32) ~ 0)
+ create_api_loopbacks (vm);
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+ vec_validate (builtin_proxy_main.server_event_queue, num_threads - 1);
+ vec_validate (builtin_proxy_main.active_open_event_queue, num_threads - 1);
+ vec_validate (bpm->rx_buf, num_threads - 1);
+
+ for (i = 0; i < num_threads; i++)
+ vec_validate (bpm->rx_buf[i], bpm->rcv_buffer_size);
+
+ if (server_attach ())
+ {
+ clib_warning ("failed to attach server app");
+ return -1;
+ }
+ if (server_listen ())
+ {
+ clib_warning ("failed to start listening");
+ return -1;
+ }
+ if (active_open_attach ())
+ {
+ clib_warning ("failed to attach active open app");
+ return -1;
+ }
+
+ for (i = 0; i < num_threads; i++)
+ {
+ bpm->active_open_event_queue[i] =
+ session_manager_get_vpp_event_queue (i);
+
+ ASSERT (bpm->active_open_event_queue[i]);
+
+ bpm->server_event_queue[i] = session_manager_get_vpp_event_queue (i);
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+proxy_server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ int rv;
+ u64 tmp;
+
+ bpm->fifo_size = 64 << 10;
+ bpm->rcv_buffer_size = 1024;
+ bpm->prealloc_fifos = 0;
+ bpm->private_segment_count = 0;
+ bpm->private_segment_size = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "fifo-size %d", &bpm->fifo_size))
+ bpm->fifo_size <<= 10;
+ else if (unformat (input, "rcv-buf-size %d", &bpm->rcv_buffer_size))
+ ;
+ else if (unformat (input, "prealloc-fifos %d", &bpm->prealloc_fifos))
+ ;
+ else if (unformat (input, "private-segment-count %d",
+ &bpm->private_segment_count))
+ ;
+ else if (unformat (input, "private-segment-size %U",
+ unformat_memory_size, &tmp))
+ {
+ if (tmp >= 0x100000000ULL)
+ return clib_error_return
+ (0, "private segment size %lld (%llu) too large", tmp, tmp);
+ bpm->private_segment_size = tmp;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+
+ rv = server_create (vm);
+ switch (rv)
+ {
+ case 0:
+ break;
+ default:
+ return clib_error_return (0, "server_create returned %d", rv);
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+ .path = "test proxy server",
+ .short_help = "test proxy server",
+ .function = proxy_server_create_command_fn,
+};
+/* *INDENT-ON* */
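+/*
+ * Debug CLI example (hypothetical parameters, matching the unformat
+ * options above):
+ *   test proxy server fifo-size 64 rcv-buf-size 65536
+ * The proxy listens on tcp://0.0.0.0/23 and opens active connections
+ * to tcp://6.0.2.2/23.
+ */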
+
+clib_error_t *
+builtin_tcp_proxy_main_init (vlib_main_t * vm)
+{
+ builtin_proxy_main_t *bpm = &builtin_proxy_main;
+ bpm->server_client_index = ~0;
+ bpm->active_open_client_index = ~0;
+ bpm->proxy_session_by_active_open_handle = hash_create (0, sizeof (uword));
+ bpm->proxy_session_by_server_handle = hash_create (0, sizeof (uword));
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_tcp_proxy_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/builtin_proxy.h b/src/vnet/tcp/builtin_proxy.h
new file mode 100644
index 00000000..cf707a15
--- /dev/null
+++ b/src/vnet/tcp/builtin_proxy.h
@@ -0,0 +1,100 @@
+
+/*
+ * builtin_proxy.h - skeleton vpp engine plug-in header file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_builtin_proxy_h__
+#define __included_builtin_proxy_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+#include <svm/svm_fifo_segment.h>
+#include <vnet/session/session.h>
+#include <vnet/session/application_interface.h>
+
+typedef struct
+{
+ svm_fifo_t *server_rx_fifo;
+ svm_fifo_t *server_tx_fifo;
+
+ u64 vpp_server_handle;
+ u64 vpp_active_open_handle;
+} proxy_session_t;
+
+typedef struct
+{
+ unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */
+ /** per-thread vectors */
+ unix_shared_memory_queue_t **server_event_queue;
+ unix_shared_memory_queue_t **active_open_event_queue;
+ u8 **rx_buf; /**< intermediate rx buffers */
+
+ u32 cli_node_index; /**< cli process node index */
+ u32 server_client_index; /**< server API client handle */
+ u32 server_app_index; /**< server app index */
+ u32 active_open_client_index; /**< active open API client handle */
+ u32 active_open_app_index; /**< active open index after attach */
+
+ uword *proxy_session_by_server_handle;
+ uword *proxy_session_by_active_open_handle;
+
+ /*
+ * Configuration params
+ */
+ u8 *connect_uri; /**< URI for slave's connect */
+ u32 configured_segment_size;
+ u32 fifo_size;
+ u32 private_segment_count; /**< Number of private fifo segs */
+ u32 private_segment_size; /**< size of private fifo segs */
+ int rcv_buffer_size;
+
+ /*
+ * Test state variables
+ */
+ proxy_session_t *sessions; /**< Session pool, shared */
+ clib_spinlock_t sessions_lock;
+ u32 **connection_index_by_thread;
+ pthread_t client_thread_handle;
+
+ /*
+ * Flags
+ */
+ u8 is_init;
+ u8 prealloc_fifos; /**< Request fifo preallocation */
+
+ /*
+ * Convenience
+ */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+ ethernet_main_t *ethernet_main;
+} builtin_proxy_main_t;
+
+builtin_proxy_main_t builtin_proxy_main;
+
+#endif /* __included_builtin_proxy_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
new file mode 100644
index 00000000..93314529
--- /dev/null
+++ b/src/vnet/tcp/builtin_server.c
@@ -0,0 +1,455 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+
+typedef struct
+{
+ /*
+ * Server app parameters
+ */
+ unix_shared_memory_queue_t **vpp_queue;
+  unix_shared_memory_queue_t *vl_input_queue;	/**< Server's event queue */
+
+ u32 app_index; /**< Server app index */
+ u32 my_client_index; /**< API client handle */
+  u32 node_index;		/**< process node index for event scheduling */
+
+ /*
+ * Config params
+ */
+ u8 no_echo; /**< Don't echo traffic */
+ u32 fifo_size; /**< Fifo size */
+ u32 rcv_buffer_size; /**< Rcv buffer size */
+ u32 prealloc_fifos; /**< Preallocate fifos */
+ u32 private_segment_count; /**< Number of private segments */
+ u32 private_segment_size; /**< Size of private segments */
+ char *server_uri; /**< Server URI */
+
+ /*
+ * Test state
+ */
+ u8 **rx_buf; /**< Per-thread RX buffer */
+ u64 byte_index;
+ u32 **rx_retries;
+
+ vlib_main_t *vlib_main;
+} builtin_server_main_t;
+
+builtin_server_main_t builtin_server_main;
+
+int
+builtin_session_accept_callback (stream_session_t * s)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+
+ bsm->vpp_queue[s->thread_index] =
+ session_manager_get_vpp_event_queue (s->thread_index);
+ s->session_state = SESSION_STATE_READY;
+ bsm->byte_index = 0;
+ vec_validate (bsm->rx_retries[s->thread_index], s->session_index);
+ bsm->rx_retries[s->thread_index][s->session_index] = 0;
+ return 0;
+}
+
+void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ vnet_disconnect_args_t _a, *a = &_a;
+
+ a->handle = stream_session_handle (s);
+ a->app_index = bsm->app_index;
+ vnet_disconnect_session (a);
+}
+
+void
+builtin_session_reset_callback (stream_session_t * s)
+{
+ clib_warning ("Reset session %U", format_stream_session, s, 2);
+ stream_session_cleanup (s);
+}
+
+
+int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+ stream_session_t * s, u8 is_fail)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+int
+builtin_add_segment_callback (u32 client_index,
+ const u8 * seg_name, u32 seg_size)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+int
+builtin_redirect_connect_callback (u32 client_index, void *mp)
+{
+ clib_warning ("called...");
+ return -1;
+}
+
+void
+test_bytes (builtin_server_main_t * bsm, int actual_transfer)
+{
+ int i;
+ u32 my_thread_id = vlib_get_thread_index ();
+
+ for (i = 0; i < actual_transfer; i++)
+ {
+ if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff))
+ {
+ clib_warning ("at %lld expected %d got %d", bsm->byte_index + i,
+ (bsm->byte_index + i) & 0xff,
+ bsm->rx_buf[my_thread_id][i]);
+ }
+ }
+ bsm->byte_index += actual_transfer;
+}
+
+/*
+ * If no-echo, just read the data and be done with it
+ */
+int
+builtin_server_rx_callback_no_echo (stream_session_t * s)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ u32 my_thread_id = vlib_get_thread_index ();
+ int actual_transfer;
+ svm_fifo_t *rx_fifo;
+
+ rx_fifo = s->server_rx_fifo;
+
+ do
+ {
+ actual_transfer =
+ svm_fifo_dequeue_nowait (rx_fifo, bsm->rcv_buffer_size,
+ bsm->rx_buf[my_thread_id]);
+ }
+ while (actual_transfer > 0);
+ return 0;
+}
+
+int
+builtin_server_rx_callback (stream_session_t * s)
+{
+ u32 n_written, max_dequeue, max_enqueue, max_transfer;
+ int actual_transfer;
+ svm_fifo_t *tx_fifo, *rx_fifo;
+ builtin_server_main_t *bsm = &builtin_server_main;
+ session_fifo_event_t evt;
+ static int serial_number = 0;
+ u32 thread_index = vlib_get_thread_index ();
+
+ ASSERT (s->thread_index == thread_index);
+
+ rx_fifo = s->server_rx_fifo;
+ tx_fifo = s->server_tx_fifo;
+
+ ASSERT (rx_fifo->master_thread_index == thread_index);
+ ASSERT (tx_fifo->master_thread_index == thread_index);
+
+ max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
+ max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo);
+
+ if (PREDICT_FALSE (max_dequeue == 0))
+ return 0;
+
+ /* Number of bytes we're going to copy */
+ max_transfer = (max_dequeue < max_enqueue) ? max_dequeue : max_enqueue;
+
+ /* No space in tx fifo */
+ if (PREDICT_FALSE (max_transfer == 0))
+ {
+      /* XXX timeout for sessions that are stuck */
+
+ rx_event:
+ /* Program self-tap to retry */
+ if (svm_fifo_set_event (rx_fifo))
+ {
+ unix_shared_memory_queue_t *q;
+ evt.fifo = rx_fifo;
+ evt.event_type = FIFO_EVENT_BUILTIN_RX;
+ evt.event_id = 0;
+
+ q = bsm->vpp_queue[thread_index];
+ if (PREDICT_FALSE (q->cursize == q->maxsize))
+ clib_warning ("out of event queue space");
+ else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0))
+ clib_warning ("failed to enqueue self-tap");
+
+ if (bsm->rx_retries[thread_index][s->session_index] == 500000)
+ {
+ clib_warning ("session stuck: %U", format_stream_session, s, 2);
+ }
+ if (bsm->rx_retries[thread_index][s->session_index] < 500001)
+ bsm->rx_retries[thread_index][s->session_index]++;
+ }
+
+ return 0;
+ }
+
+ _vec_len (bsm->rx_buf[thread_index]) = max_transfer;
+
+ actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer,
+ bsm->rx_buf[thread_index]);
+ ASSERT (actual_transfer == max_transfer);
+
+// test_bytes (bsm, actual_transfer);
+
+ /*
+ * Echo back
+ */
+
+ n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer,
+ bsm->rx_buf[thread_index]);
+
+ if (n_written != max_transfer)
+    clib_warning ("short tx enqueue: %u of %u bytes written",
+		  n_written, max_transfer);
+
+ if (svm_fifo_set_event (tx_fifo))
+ {
+ /* Fabricate TX event, send to vpp */
+ evt.fifo = tx_fifo;
+ evt.event_type = FIFO_EVENT_APP_TX;
+ evt.event_id = serial_number++;
+
+ if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index],
+ (u8 *) & evt,
+ 0 /* do wait for mutex */ ))
+ clib_warning ("failed to enqueue tx evt");
+ }
+
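+  /* Not everything pending in the rx fifo fit in the tx fifo: jump back
+   * and program a self-tap so the remainder is echoed on a subsequent
+   * builtin rx event. */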
+ if (PREDICT_FALSE (n_written < max_dequeue))
+ goto rx_event;
+
+ return 0;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+ .session_accept_callback = builtin_session_accept_callback,
+ .session_disconnect_callback = builtin_session_disconnect_callback,
+ .session_connected_callback = builtin_session_connected_callback,
+ .add_segment_callback = builtin_add_segment_callback,
+ .redirect_connect_callback = builtin_redirect_connect_callback,
+ .builtin_server_rx_callback = builtin_server_rx_callback,
+ .session_reset_callback = builtin_session_reset_callback
+};
+
+/* Abuse VPP's input queue */
+static int
+create_api_loopback (vlib_main_t * vm)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr;
+
+ shmem_hdr = am->shmem_hdr;
+ bsm->vl_input_queue = shmem_hdr->vl_input_queue;
+ bsm->my_client_index =
+ vl_api_memclnt_create_internal ("tcp_test_server", bsm->vl_input_queue);
+ return 0;
+}
+
+static int
+server_attach ()
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ u8 segment_name[128];
+ u64 options[SESSION_OPTIONS_N_OPTIONS];
+ vnet_app_attach_args_t _a, *a = &_a;
+
+ memset (a, 0, sizeof (*a));
+ memset (options, 0, sizeof (options));
+
+ if (bsm->no_echo)
+ builtin_session_cb_vft.builtin_server_rx_callback =
+ builtin_server_rx_callback_no_echo;
+ else
+ builtin_session_cb_vft.builtin_server_rx_callback =
+ builtin_server_rx_callback;
+ a->api_client_index = bsm->my_client_index;
+ a->session_cb_vft = &builtin_session_cb_vft;
+ a->options = options;
+ a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+ a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size;
+ a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count;
+ a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size;
+ a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+ bsm->prealloc_fifos ? bsm->prealloc_fifos : 1;
+
+ a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
+ a->segment_name = segment_name;
+ a->segment_name_length = ARRAY_LEN (segment_name);
+
+ if (vnet_application_attach (a))
+ {
+ clib_warning ("failed to attach server");
+ return -1;
+ }
+ bsm->app_index = a->app_index;
+ return 0;
+}
+
+static int
+server_listen ()
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ vnet_bind_args_t _a, *a = &_a;
+ memset (a, 0, sizeof (*a));
+ a->app_index = bsm->app_index;
+ a->uri = bsm->server_uri;
+ return vnet_bind_uri (a);
+}
+
+static int
+server_create (vlib_main_t * vm)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ u32 num_threads;
+ int i;
+
+ if (bsm->my_client_index == (u32) ~ 0)
+ {
+ if (create_api_loopback (vm))
+ {
+ clib_warning ("failed to create api loopback");
+ return -1;
+ }
+ }
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+  vec_validate (bsm->vpp_queue, num_threads - 1);
+ vec_validate (bsm->rx_buf, num_threads - 1);
+ vec_validate (bsm->rx_retries, num_threads - 1);
+
+ for (i = 0; i < num_threads; i++)
+ vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size);
+
+ if (server_attach ())
+ {
+ clib_warning ("failed to attach server");
+ return -1;
+ }
+ if (server_listen ())
+ {
+ clib_warning ("failed to start listening");
+ return -1;
+ }
+ return 0;
+}
+
+static clib_error_t *
+server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ u8 server_uri_set = 0;
+ int rv;
+ u64 tmp;
+
+ bsm->no_echo = 0;
+ bsm->fifo_size = 64 << 10;
+ bsm->rcv_buffer_size = 128 << 10;
+ bsm->prealloc_fifos = 0;
+ bsm->private_segment_count = 0;
+ bsm->private_segment_size = 0;
+ vec_free (bsm->server_uri);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "no-echo"))
+ bsm->no_echo = 1;
+ else if (unformat (input, "fifo-size %d", &bsm->fifo_size))
+ bsm->fifo_size <<= 10;
+ else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size))
+ ;
+ else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos))
+ ;
+ else if (unformat (input, "private-segment-count %d",
+ &bsm->private_segment_count))
+ ;
+ else if (unformat (input, "private-segment-size %U",
+ unformat_memory_size, &tmp))
+ {
+ if (tmp >= 0x100000000ULL)
+ return clib_error_return
+ (0, "private segment size %lld (%llu) too large", tmp, tmp);
+ bsm->private_segment_size = tmp;
+ }
+ else if (unformat (input, "uri %s", &bsm->server_uri))
+ server_uri_set = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+
+ if (!server_uri_set)
+ bsm->server_uri = (char *) format (0, "tcp://0.0.0.0/1234%c", 0);
+
+ rv = server_create (vm);
+ switch (rv)
+ {
+ case 0:
+ break;
+ default:
+ return clib_error_return (0, "server_create returned %d", rv);
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+ .path = "test tcp server",
+ .short_help = "test tcp server [no echo][fifo-size <mbytes>] "
+ "[rcv-buf-size <bytes>][prealloc-fifos <count>]"
+ "[private-segment-count <count>][private-segment-size <bytes[m|g]>]"
+ "[uri <tcp://ip/port>]",
+ .function = server_create_command_fn,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+builtin_tcp_server_main_init (vlib_main_t * vm)
+{
+ builtin_server_main_t *bsm = &builtin_server_main;
+ bsm->my_client_index = ~0;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_tcp_server_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/tcp.api b/src/vnet/tcp/tcp.api
new file mode 100644
index 00000000..093a5a89
--- /dev/null
+++ b/src/vnet/tcp/tcp.api
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015-2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief Configure TCP source addresses, for active-open TCP sessions
+
+ TCP src/dst ports are 16 bits, with the low-order 1024 ports
+ reserved. So, it's necessary to provide a considerable number of
+ source IP addresses if one wishes to initiate a large number of
+ connections.
+
+ Each of those addresses needs to have a receive adjacency -
+ either a /32 or a /128 - and vpp needs to answer (proxy) arps or
+ neighbor discovery requests for the addresses.
+
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param is_ipv6 - 1 for ipv6, 0 for ipv4
+ @param vrf_id - fib table / vrf id for local adjacencies
+ @param first_address - first address that TCP will use
+ @param last_address - last address that TCP will use
+*/
+autoreply define tcp_configure_src_addresses {
+ u32 client_index;
+ u32 context;
+ u8 is_ipv6;
+ u32 vrf_id;
+ u8 first_address[16];
+ u8 last_address[16];
+ };
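+
+/*
+ * Worked example (illustrative only): each source address yields at most
+ * 65536 - 1024 = 64512 usable source ports, so initiating roughly one
+ * million connections to a single remote endpoint needs a range of at
+ * least ceil(1000000 / 64512) = 16 source addresses.
+ */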
+
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
new file mode 100644
index 00000000..a365cb48
--- /dev/null
+++ b/src/vnet/tcp/tcp.c
@@ -0,0 +1,1943 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ * @brief TCP host stack utilities
+ */
+
+#include <vnet/tcp/tcp.h>
+#include <vnet/session/session.h>
+#include <vnet/fib/fib.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/dpo/receive_dpo.h>
+#include <vnet/ip/ip6_neighbor.h>
+#include <math.h>
+
+tcp_main_t tcp_main;
+
+static u32
+tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl)
+{
+ tcp_main_t *tm = &tcp_main;
+ tcp_connection_t *listener;
+
+ pool_get (tm->listener_pool, listener);
+ memset (listener, 0, sizeof (*listener));
+
+ listener->c_c_index = listener - tm->listener_pool;
+ listener->c_lcl_port = lcl->port;
+
+ if (lcl->is_ip4)
+ {
+ listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32;
+ listener->c_is_ip4 = 1;
+ }
+ else
+ {
+ clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6,
+ sizeof (ip6_address_t));
+
+ }
+ listener->c_transport_proto = TRANSPORT_PROTO_TCP;
+ listener->c_s_index = session_index;
+ listener->state = TCP_STATE_LISTEN;
+
+ tcp_connection_timers_init (listener);
+
+ TCP_EVT_DBG (TCP_EVT_BIND, listener);
+
+ return listener->c_c_index;
+}
+
+u32
+tcp_session_bind (u32 session_index, transport_endpoint_t * tep)
+{
+ return tcp_connection_bind (session_index, tep);
+}
+
+static void
+tcp_connection_unbind (u32 listener_index)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_connection_t *tc;
+
+ tc = pool_elt_at_index (tm->listener_pool, listener_index);
+
+ TCP_EVT_DBG (TCP_EVT_UNBIND, tc);
+
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (tc, 0xFA, sizeof (*tc));
+
+ pool_put_index (tm->listener_pool, listener_index);
+}
+
+u32
+tcp_session_unbind (u32 listener_index)
+{
+ tcp_connection_unbind (listener_index);
+ return 0;
+}
+
+transport_connection_t *
+tcp_session_get_listener (u32 listener_index)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_connection_t *tc;
+ tc = pool_elt_at_index (tm->listener_pool, listener_index);
+ return &tc->connection;
+}
+
+always_inline void
+transport_endpoint_del (u32 tepi)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ clib_spinlock_lock_if_init (&tm->local_endpoints_lock);
+ pool_put_index (tm->local_endpoints, tepi);
+ clib_spinlock_unlock_if_init (&tm->local_endpoints_lock);
+}
+
+always_inline transport_endpoint_t *
+transport_endpoint_new (void)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ transport_endpoint_t *tep;
+ pool_get (tm->local_endpoints, tep);
+ return tep;
+}
+
+/**
+ * Cleanup half-open connection
+ *
+ */
+void
+tcp_half_open_connection_del (tcp_connection_t * tc)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ clib_spinlock_lock_if_init (&tm->half_open_lock);
+ pool_put_index (tm->half_open_connections, tc->c_c_index);
+ if (CLIB_DEBUG)
+ memset (tc, 0xFA, sizeof (*tc));
+ clib_spinlock_unlock_if_init (&tm->half_open_lock);
+}
+
+/**
+ * Try to cleanup half-open connection
+ *
+ * If called from a thread that doesn't own tc, the call won't have any
+ * effect.
+ *
+ * @param tc - connection to be cleaned up
+ * @return non-zero if cleanup failed.
+ */
+int
+tcp_half_open_connection_cleanup (tcp_connection_t * tc)
+{
+ /* Make sure this is the owning thread */
+ if (tc->c_thread_index != vlib_get_thread_index ())
+ return 1;
+ tcp_timer_reset (tc, TCP_TIMER_ESTABLISH);
+ tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT_SYN);
+ tcp_half_open_connection_del (tc);
+ return 0;
+}
+
+tcp_connection_t *
+tcp_half_open_connection_new (void)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_connection_t *tc = 0;
+ ASSERT (vlib_get_thread_index () == 0);
+ pool_get (tm->half_open_connections, tc);
+ memset (tc, 0, sizeof (*tc));
+ tc->c_c_index = tc - tm->half_open_connections;
+ return tc;
+}
+
+/**
+ * Cleans up connection state.
+ *
+ * No notifications.
+ */
+void
+tcp_connection_cleanup (tcp_connection_t * tc)
+{
+ tcp_main_t *tm = &tcp_main;
+ u32 tepi;
+ transport_endpoint_t *tep;
+
+ /* Cleanup local endpoint if this was an active connect */
+ tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip,
+ clib_net_to_host_u16 (tc->c_lcl_port));
+ if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX)
+ {
+ tep = pool_elt_at_index (tm->local_endpoints, tepi);
+ transport_endpoint_table_del (&tm->local_endpoints_table, tep);
+ transport_endpoint_del (tepi);
+ }
+
+ /* Check if connection is not yet fully established */
+ if (tc->state == TCP_STATE_SYN_SENT)
+ {
+ /* Try to remove the half-open connection. If this is not the owning
+ * thread, tc won't be removed. Retransmit or establish timers will
+       * eventually expire and call cleanup again on the right thread. */
+ tcp_half_open_connection_cleanup (tc);
+ }
+ else
+ {
+ int thread_index = tc->c_thread_index;
+
+ /* Make sure all timers are cleared */
+ tcp_connection_timers_reset (tc);
+
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (tc, 0xFA, sizeof (*tc));
+ pool_put (tm->connections[thread_index], tc);
+ }
+}
+
+/**
+ * Connection removal.
+ *
+ * This should be called only once the connection enters CLOSED state. Note
+ * that it notifies the session of the removal event, so if the goal is to
+ * just remove the connection, call tcp_connection_cleanup instead.
+ */
+void
+tcp_connection_del (tcp_connection_t * tc)
+{
+ TCP_EVT_DBG (TCP_EVT_DELETE, tc);
+ stream_session_delete_notify (&tc->connection);
+ tcp_connection_cleanup (tc);
+}
+
+tcp_connection_t *
+tcp_connection_new (u8 thread_index)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_connection_t *tc;
+
+ pool_get (tm->connections[thread_index], tc);
+ memset (tc, 0, sizeof (*tc));
+ tc->c_c_index = tc - tm->connections[thread_index];
+ tc->c_thread_index = thread_index;
+ return tc;
+}
+
+/** Notify session that connection has been reset.
+ *
+ * Switch state to closed and wait for session to call cleanup.
+ */
+void
+tcp_connection_reset (tcp_connection_t * tc)
+{
+ TCP_EVT_DBG (TCP_EVT_RST_RCVD, tc);
+ switch (tc->state)
+ {
+ case TCP_STATE_SYN_RCVD:
+ /* Cleanup everything. App wasn't notified yet */
+ stream_session_delete_notify (&tc->connection);
+ tcp_connection_cleanup (tc);
+ break;
+ case TCP_STATE_SYN_SENT:
+ stream_session_connect_notify (&tc->connection, 1 /* fail */ );
+ tcp_connection_cleanup (tc);
+ break;
+ case TCP_STATE_ESTABLISHED:
+ case TCP_STATE_CLOSE_WAIT:
+ case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_FIN_WAIT_2:
+ case TCP_STATE_CLOSING:
+ tc->state = TCP_STATE_CLOSED;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+
+ /* Make sure all timers are cleared */
+ tcp_connection_timers_reset (tc);
+ stream_session_reset_notify (&tc->connection);
+
+ /* Wait for cleanup from session layer but not forever */
+ tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+ break;
+ case TCP_STATE_CLOSED:
+ return;
+ }
+}
+
+/**
+ * Begin connection closing procedure.
+ *
+ * If at the end the connection is not in CLOSED state, it is not removed.
+ * Instead, we rely on TCP to advance through the state machine to either
+ * 1) LAST_ACK (passive close) whereby when the last ACK is received
+ * tcp_connection_del is called. This notifies session of the delete and
+ * calls cleanup.
+ * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers
+ * and cleanup is called.
+ *
+ * N.B. Half-close connections are not supported
+ */
+void
+tcp_connection_close (tcp_connection_t * tc)
+{
+ TCP_EVT_DBG (TCP_EVT_CLOSE, tc);
+
+ /* Send/Program FIN if needed and switch state */
+ switch (tc->state)
+ {
+ case TCP_STATE_SYN_SENT:
+ tc->state = TCP_STATE_CLOSED;
+ break;
+ case TCP_STATE_SYN_RCVD:
+ tcp_send_fin (tc);
+ tc->state = TCP_STATE_FIN_WAIT_1;
+ break;
+ case TCP_STATE_ESTABLISHED:
+ if (!stream_session_tx_fifo_max_dequeue (&tc->connection))
+ tcp_send_fin (tc);
+ else
+ tc->flags |= TCP_CONN_FINPNDG;
+ tc->state = TCP_STATE_FIN_WAIT_1;
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ tcp_send_fin (tc);
+ tc->state = TCP_STATE_LAST_ACK;
+ break;
+ case TCP_STATE_FIN_WAIT_1:
+ break;
+ default:
+ clib_warning ("state: %u", tc->state);
+ }
+
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+
+ /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */
+ if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID
+ && tc->state == TCP_STATE_CLOSED)
+ tcp_connection_del (tc);
+}
+
+void
+tcp_session_close (u32 conn_index, u32 thread_index)
+{
+ tcp_connection_t *tc;
+ tc = tcp_connection_get (conn_index, thread_index);
+ tcp_connection_close (tc);
+}
+
+void
+tcp_session_cleanup (u32 conn_index, u32 thread_index)
+{
+ tcp_connection_t *tc;
+ tc = tcp_connection_get (conn_index, thread_index);
+
+ /* Wait for the session tx events to clear */
+ tc->state = TCP_STATE_CLOSED;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+ tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+}
+
+void *
+ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
+{
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_interface_address_t *ia = 0;
+
+ if (is_ip4)
+ {
+ /* *INDENT-OFF* */
+ foreach_ip_interface_address (lm4, ia, sw_if_index, 1 /* unnumbered */ ,
+ ({
+ return ip_interface_address_get_address (lm4, ia);
+ }));
+ /* *INDENT-ON* */
+ }
+ else
+ {
+ /* *INDENT-OFF* */
+ foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ ,
+ ({
+ ip6_address_t *rv;
+ rv = ip_interface_address_get_address (lm6, ia);
+ /* Trying to use a link-local ip6 src address is a fool's errand */
+ if (!ip6_address_is_link_local_unicast (rv))
+ return rv;
+ }));
+ /* *INDENT-ON* */
+ }
+
+ return 0;
+}
+
+#define PORT_MASK ((1 << 16) - 1)
+/**
+ * Allocate a local port and, if successful, add an entry to the local
+ * endpoint table to mark the pair as used.
+ */
+int
+tcp_allocate_local_port (ip46_address_t * ip)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ transport_endpoint_t *tep;
+ u32 tei;
+ u16 min = 1024, max = 65535; /* XXX configurable ? */
+ int tries, limit;
+
+ limit = max - min;
+
+ /* Only support active opens from thread 0 */
+ ASSERT (vlib_get_thread_index () == 0);
+
+ /* Search for first free slot */
+ for (tries = 0; tries < limit; tries++)
+ {
+ u16 port = 0;
+
+ /* Find a port in the specified range */
+ while (1)
+ {
+ port = random_u32 (&tm->port_allocator_seed) & PORT_MASK;
+ if (PREDICT_TRUE (port >= min && port < max))
+ break;
+ }
+
+ /* Look it up */
+ tei = transport_endpoint_lookup (&tm->local_endpoints_table, ip, port);
+ /* If not found, we're done */
+ if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX)
+ {
+ clib_spinlock_lock_if_init (&tm->local_endpoints_lock);
+ tep = transport_endpoint_new ();
+ clib_memcpy (&tep->ip, ip, sizeof (*ip));
+ tep->port = port;
+ transport_endpoint_table_add (&tm->local_endpoints_table, tep,
+ tep - tm->local_endpoints);
+ clib_spinlock_unlock_if_init (&tm->local_endpoints_lock);
+
+ return tep->port;
+ }
+ }
+ return -1;
+}
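+
+/*
+ * Note (illustrative): with min = 1024 and max = 65535 the allocator makes
+ * at most max - min = 64511 random probes in [1024, 65535) before giving
+ * up, so it may return -1 before the space is provably exhausted. Callers
+ * such as tcp_connection_open below treat any value < 1 as failure.
+ */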
+
+/**
+ * Initialize all connection timers as invalid
+ */
+void
+tcp_connection_timers_init (tcp_connection_t * tc)
+{
+ int i;
+
+ /* Set all to invalid */
+ for (i = 0; i < TCP_N_TIMERS; i++)
+ {
+ tc->timers[i] = TCP_TIMER_HANDLE_INVALID;
+ }
+
+ tc->rto = TCP_RTO_INIT;
+}
+
+/**
+ * Stop all connection timers
+ */
+void
+tcp_connection_timers_reset (tcp_connection_t * tc)
+{
+ int i;
+ for (i = 0; i < TCP_N_TIMERS; i++)
+ {
+ tcp_timer_reset (tc, i);
+ }
+}
+
+#if 0
+typedef struct ip4_tcp_hdr
+{
+ ip4_header_t ip;
+ tcp_header_t tcp;
+} ip4_tcp_hdr_t;
+
+typedef struct ip6_tcp_hdr
+{
+ ip6_header_t ip;
+ tcp_header_t tcp;
+} ip6_tcp_hdr_t;
+
+static void
+tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo,
+ dpo_id_t * result)
+{
+ const dpo_id_t *choice;
+ load_balance_t *lb;
+ int hash;
+
+ lb = load_balance_get (dpo->dpoi_index);
+ if (tc->c_is_ip4)
+ {
+ ip4_tcp_hdr_t hdr;
+ memset (&hdr, 0, sizeof (hdr));
+ hdr.ip.protocol = IP_PROTOCOL_TCP;
+ hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32;
+ hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32;
+ hdr.tcp.src_port = tc->c_lcl_port;
+ hdr.tcp.dst_port = tc->c_rmt_port;
+ hash = ip4_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
+ }
+ else
+ {
+ ip6_tcp_hdr_t hdr;
+ memset (&hdr, 0, sizeof (hdr));
+ hdr.ip.protocol = IP_PROTOCOL_TCP;
+ clib_memcpy (&hdr.ip.src_address, &tc->c_lcl_ip.ip6,
+ sizeof (ip6_address_t));
+ clib_memcpy (&hdr.ip.dst_address, &tc->c_rmt_ip.ip6,
+ sizeof (ip6_address_t));
+ hdr.tcp.src_port = tc->c_lcl_port;
+ hdr.tcp.dst_port = tc->c_rmt_port;
+ hash = ip6_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
+ }
+ choice = load_balance_get_bucket_i (lb, hash & lb->lb_n_buckets_minus_1);
+ dpo_copy (result, choice);
+}
+
+fib_node_index_t
+tcp_lookup_rmt_in_fib (tcp_connection_t * tc)
+{
+ fib_prefix_t prefix;
+ u32 fib_index;
+
+ clib_memcpy (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr));
+ prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
+ prefix.fp_len = tc->c_is_ip4 ? 32 : 128;
+ fib_index = fib_table_find (prefix.fp_proto, tc->c_vrf);
+ return fib_table_lookup (fib_index, &prefix);
+}
+
+static int
+tcp_connection_stack_on_fib_entry (tcp_connection_t * tc)
+{
+ dpo_id_t choice = DPO_INVALID;
+ u32 output_node_index;
+ fib_entry_t *fe;
+
+ fe = fib_entry_get (tc->c_rmt_fei);
+ if (fe->fe_lb.dpoi_type != DPO_LOAD_BALANCE)
+ return -1;
+
+ tcp_connection_select_lb_bucket (tc, &fe->fe_lb, &choice);
+
+ output_node_index =
+ tc->c_is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
+ dpo_stack_from_node (output_node_index, &tc->c_rmt_dpo, &choice);
+ return 0;
+}
+
+/** Stack tcp connection on peer's fib entry.
+ *
+ * This ultimately populates the dpo the connection will use to send packets.
+ */
+static void
+tcp_connection_fib_attach (tcp_connection_t * tc)
+{
+ tc->c_rmt_fei = tcp_lookup_rmt_in_fib (tc);
+
+ ASSERT (tc->c_rmt_fei != FIB_NODE_INDEX_INVALID);
+
+ tcp_connection_stack_on_fib_entry (tc);
+}
+#endif /* 0 */
+
+/**
+ * Initialize connection send variables.
+ */
+void
+tcp_init_snd_vars (tcp_connection_t * tc)
+{
+ u32 time_now;
+
+ /*
+ * We use the time to randomize iss and for setting up the initial
+   * timestamp. Make sure it's updated, otherwise the syn and ack in the
+   * handshake may make it look as if time has flowed backwards for us.
+ */
+ tcp_set_time_now (vlib_get_thread_index ());
+ time_now = tcp_time_now ();
+
+ tc->iss = random_u32 (&time_now);
+ tc->snd_una = tc->iss;
+ tc->snd_nxt = tc->iss + 1;
+ tc->snd_una_max = tc->snd_nxt;
+}
+
+/** Initialize tcp connection variables
+ *
+ * Should be called after having received a msg from the peer, i.e., a SYN or
+ * a SYNACK, such that connection options have already been exchanged. */
+void
+tcp_connection_init_vars (tcp_connection_t * tc)
+{
+ tcp_connection_timers_init (tc);
+ tcp_init_mss (tc);
+ scoreboard_init (&tc->sack_sb);
+ tcp_cc_init (tc);
+ if (tc->state == TCP_STATE_SYN_RCVD)
+ tcp_init_snd_vars (tc);
+
+ // tcp_connection_fib_attach (tc);
+}
+
+int
+tcp_connection_open (transport_endpoint_t * rmt)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_connection_t *tc;
+ fib_prefix_t prefix;
+ fib_node_index_t fei;
+ u32 sw_if_index, fib_index;
+ ip46_address_t lcl_addr;
+ int lcl_port;
+
+ /*
+ * Find the local address and allocate port
+ */
+ memset (&lcl_addr, 0, sizeof (lcl_addr));
+
+ /* Find a FIB path to the destination */
+ clib_memcpy (&prefix.fp_addr, &rmt->ip, sizeof (rmt->ip));
+ prefix.fp_proto = rmt->is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
+ prefix.fp_len = rmt->is_ip4 ? 32 : 128;
+
+ fib_index = fib_table_find (prefix.fp_proto, rmt->vrf);
+ if (fib_index == (u32) ~ 0)
+ {
+ clib_warning ("no fib table");
+ return -1;
+ }
+
+ fei = fib_table_lookup (fib_index, &prefix);
+
+ /* Couldn't find route to destination. Bail out. */
+ if (fei == FIB_NODE_INDEX_INVALID)
+ {
+ clib_warning ("no route to destination");
+ return -1;
+ }
+
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+
+ if (sw_if_index == (u32) ~ 0)
+ {
+ clib_warning ("no resolving interface for %U", format_ip46_address,
+ &rmt->ip, IP46_TYPE_IP4);
+ return -1;
+ }
+
+ if (rmt->is_ip4)
+ {
+ ip4_address_t *ip4;
+ int index;
+ if (vec_len (tm->ip4_src_addresses))
+ {
+ index = tm->last_v4_address_rotor++;
+ if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses))
+ tm->last_v4_address_rotor = 0;
+ lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32;
+ }
+ else
+ {
+ ip4 = ip_interface_get_first_ip (sw_if_index, 1);
+ lcl_addr.ip4.as_u32 = ip4->as_u32;
+ }
+ }
+ else
+ {
+ ip6_address_t *ip6;
+ int index;
+
+ if (vec_len (tm->ip6_src_addresses))
+ {
+ index = tm->last_v6_address_rotor++;
+ if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses))
+ tm->last_v6_address_rotor = 0;
+ clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index],
+ sizeof (*ip6));
+ }
+ else
+ {
+ ip6 = ip_interface_get_first_ip (sw_if_index, 0);
+ if (ip6 == 0)
+ {
+ clib_warning ("no routable ip6 addresses on %U",
+ format_vnet_sw_if_index_name, vnet_get_main (),
+ sw_if_index);
+ return -1;
+ }
+
+ clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6));
+ }
+ }
+
+ /* Allocate source port */
+ lcl_port = tcp_allocate_local_port (&lcl_addr);
+ if (lcl_port < 1)
+ {
+ clib_warning ("Failed to allocate src port");
+ return -1;
+ }
+
+ /*
+ * Create connection and send SYN
+ */
+ clib_spinlock_lock_if_init (&tm->half_open_lock);
+ tc = tcp_half_open_connection_new ();
+ clib_memcpy (&tc->c_rmt_ip, &rmt->ip, sizeof (ip46_address_t));
+ clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t));
+ tc->c_rmt_port = rmt->port;
+ tc->c_lcl_port = clib_host_to_net_u16 (lcl_port);
+ tc->c_is_ip4 = rmt->is_ip4;
+ tc->c_transport_proto = TRANSPORT_PROTO_TCP;
+ tc->c_vrf = rmt->vrf;
+ /* The other connection vars will be initialized after SYN ACK */
+ tcp_connection_timers_init (tc);
+
+ TCP_EVT_DBG (TCP_EVT_OPEN, tc);
+ tc->state = TCP_STATE_SYN_SENT;
+ tcp_init_snd_vars (tc);
+ tcp_send_syn (tc);
+ clib_spinlock_unlock_if_init (&tm->half_open_lock);
+
+ return tc->c_c_index;
+}
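+
+/*
+ * Illustrative sketch only, wrapped in #if 0 like the unused code above:
+ * open an active connection to a hypothetical 192.0.2.1:80 in the default
+ * VRF. The endpoint fields are the ones consumed by tcp_connection_open;
+ * the port is assumed to be in network byte order, as the code above
+ * implies.
+ */
+#if 0
+static int
+tcp_connection_open_example (void)
+{
+  transport_endpoint_t rmt;
+
+  memset (&rmt, 0, sizeof (rmt));
+  rmt.is_ip4 = 1;
+  rmt.ip.ip4.as_u32 = clib_host_to_net_u32 (0xc0000201); /* 192.0.2.1 */
+  rmt.port = clib_host_to_net_u16 (80);
+  rmt.vrf = 0;
+  return tcp_connection_open (&rmt);
+}
+#endif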
+
+int
+tcp_session_open (transport_endpoint_t * tep)
+{
+ return tcp_connection_open (tep);
+}
+
+const char *tcp_dbg_evt_str[] = {
+#define _(sym, str) str,
+ foreach_tcp_dbg_evt
+#undef _
+};
+
+const char *tcp_fsm_states[] = {
+#define _(sym, str) str,
+ foreach_tcp_fsm_state
+#undef _
+};
+
+u8 *
+format_tcp_state (u8 * s, va_list * args)
+{
+ u32 state = va_arg (*args, u32);
+
+ if (state < TCP_N_STATES)
+ s = format (s, "%s", tcp_fsm_states[state]);
+ else
+ s = format (s, "UNKNOWN (%d (0x%x))", state, state);
+ return s;
+}
+
+const char *tcp_conn_timers[] = {
+#define _(sym, str) str,
+ foreach_tcp_timer
+#undef _
+};
+
+u8 *
+format_tcp_timers (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ int i, last = -1;
+
+ for (i = 0; i < TCP_N_TIMERS; i++)
+ if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
+ last = i;
+
+ s = format (s, "[");
+ for (i = 0; i < last; i++)
+ {
+ if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
+ s = format (s, "%s,", tcp_conn_timers[i]);
+ }
+
+ if (last >= 0)
+    s = format (s, "%s]", tcp_conn_timers[last]);
+ else
+ s = format (s, "]");
+
+ return s;
+}
+
+u8 *
+format_tcp_congestion_status (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ if (tcp_in_recovery (tc))
+ s = format (s, "recovery");
+ else if (tcp_in_fastrecovery (tc))
+ s = format (s, "fastrecovery");
+ else
+ s = format (s, "none");
+ return s;
+}
+
+u8 *
+format_tcp_vars (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ s = format (s, " snd_una %u snd_nxt %u snd_una_max %u",
+ tc->snd_una - tc->iss, tc->snd_nxt - tc->iss,
+ tc->snd_una_max - tc->iss);
+ s = format (s, " rcv_nxt %u rcv_las %u\n",
+ tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs);
+ s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n",
+ tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
+ tc->snd_wl2 - tc->iss);
+ s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
+ tcp_flight_size (tc), tcp_available_output_snd_space (tc),
+ tcp_rcv_wnd_available (tc));
+ s = format (s, " cong %U ", format_tcp_congestion_status, tc);
+ s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
+ tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
+ s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u",
+ tc->prev_ssthresh, tc->snd_congestion - tc->iss,
+ tc->rcv_dupacks);
+ s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss);
+ s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr,
+ tc->tsecr_last_ack);
+ s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto,
+ tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
+ s = format (s, "rtt_seq %u\n", tc->rtt_seq);
+ s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
+ tcp_time_now () - tc->tsval_recent_age);
+ if (tc->state >= TCP_STATE_ESTABLISHED)
+ s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb,
+ tc);
+ if (vec_len (tc->snd_sacks))
+ s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
+
+ return s;
+}
+
+u8 *
+format_tcp_connection_id (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ if (!tc)
+ return s;
+ if (tc->c_is_ip4)
+ {
+ s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T",
+ format_ip4_address, &tc->c_lcl_ip4,
+ clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address,
+ &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port));
+ }
+ else
+ {
+ s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T",
+ format_ip6_address, &tc->c_lcl_ip6,
+ clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address,
+ &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port));
+ }
+
+ return s;
+}
+
+u8 *
+format_tcp_connection (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ u32 verbose = va_arg (*args, u32);
+
+ if (!tc)
+ return s;
+ s = format (s, "%-50U", format_tcp_connection_id, tc);
+ if (verbose)
+ {
+ s = format (s, "%-15U", format_tcp_state, tc->state);
+ if (verbose > 1)
+ s = format (s, " %U\n%U", format_tcp_timers, tc, format_tcp_vars, tc);
+ }
+
+ return s;
+}
+
+u8 *
+format_tcp_session (u8 * s, va_list * args)
+{
+ u32 tci = va_arg (*args, u32);
+ u32 thread_index = va_arg (*args, u32);
+ u32 verbose = va_arg (*args, u32);
+ tcp_connection_t *tc;
+
+ tc = tcp_connection_get (tci, thread_index);
+ if (tc)
+ s = format (s, "%U", format_tcp_connection, tc, verbose);
+ else
+ s = format (s, "empty\n");
+ return s;
+}
+
+u8 *
+format_tcp_listener_session (u8 * s, va_list * args)
+{
+ u32 tci = va_arg (*args, u32);
+ tcp_connection_t *tc = tcp_listener_get (tci);
+ return format (s, "%U", format_tcp_connection_id, tc);
+}
+
+u8 *
+format_tcp_half_open_session (u8 * s, va_list * args)
+{
+ u32 tci = va_arg (*args, u32);
+ tcp_connection_t *tc = tcp_half_open_connection_get (tci);
+ return format (s, "%U", format_tcp_connection_id, tc);
+}
+
+u8 *
+format_tcp_sacks (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ sack_block_t *sacks = tc->snd_sacks;
+ sack_block_t *block;
+ int i, len = 0;
+
+ len = vec_len (sacks);
+ for (i = 0; i < len - 1; i++)
+ {
+ block = &sacks[i];
+ s = format (s, " start %u end %u\n", block->start - tc->irs,
+ block->end - tc->irs);
+ }
+ if (len)
+ {
+ block = &sacks[len - 1];
+ s = format (s, " start %u end %u", block->start - tc->irs,
+ block->end - tc->irs);
+ }
+ return s;
+}
+
+u8 *
+format_tcp_rcv_sacks (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ sack_block_t *sacks = tc->rcv_opts.sacks;
+ sack_block_t *block;
+ int i, len = 0;
+
+ len = vec_len (sacks);
+ for (i = 0; i < len - 1; i++)
+ {
+ block = &sacks[i];
+ s = format (s, " start %u end %u\n", block->start - tc->iss,
+ block->end - tc->iss);
+ }
+ if (len)
+ {
+ block = &sacks[len - 1];
+ s = format (s, " start %u end %u", block->start - tc->iss,
+ block->end - tc->iss);
+ }
+ return s;
+}
+
+u8 *
+format_tcp_sack_hole (u8 * s, va_list * args)
+{
+ sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *);
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ if (tc)
+ s = format (s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss);
+ else
+ s = format (s, " [%u, %u]", hole->start, hole->end);
+ return s;
+}
+
+u8 *
+format_tcp_scoreboard (u8 * s, va_list * args)
+{
+ sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *);
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ sack_scoreboard_hole_t *hole;
+ s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n",
+ sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes);
+ s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n",
+ sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv);
+ s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u",
+ sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt);
+
+ hole = scoreboard_first_hole (sb);
+ if (hole)
+ s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
+
+ while (hole)
+ {
+ s = format (s, "%U", format_tcp_sack_hole, hole, tc);
+ hole = scoreboard_next_hole (sb, hole);
+ }
+
+ return s;
+}
+
+transport_connection_t *
+tcp_session_get_transport (u32 conn_index, u32 thread_index)
+{
+ tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index);
+ return &tc->connection;
+}
+
+transport_connection_t *
+tcp_half_open_session_get_transport (u32 conn_index)
+{
+ tcp_connection_t *tc = tcp_half_open_connection_get (conn_index);
+ return &tc->connection;
+}
+
+/**
+ * Compute maximum segment size for session layer.
+ *
+ * Since the result needs to be the actual data length, it first computes
+ * the tcp options to be used in the next burst and subtracts their
+ * length from the connection's snd_mss.
+ */
+u16
+tcp_session_send_mss (transport_connection_t * trans_conn)
+{
+ tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+
+  /* Ensure snd_mss accurately reflects the amount of data we can push
+ * in a segment. This also makes sure that options are updated according to
+ * the current state of the connection. */
+ tcp_update_snd_mss (tc);
+
+ return tc->snd_mss;
+}
+
+always_inline u32
+tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
+{
+ if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
+ {
+ return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0;
+ }
+
+ /* If not snd_wnd constrained and we can't write at least a segment,
+ * don't try at all */
+ if (PREDICT_FALSE (snd_space < tc->snd_mss))
+ return snd_space < tc->cwnd ? 0 : snd_space;
+
+ /* round down to mss multiple */
+ return snd_space - (snd_space % tc->snd_mss);
+}
+
+/**
+ * Compute tx window session is allowed to fill.
+ *
+ * Takes into account available send space, snd_mss and the congestion
+ * state of the connection. If possible, the value returned is a multiple
+ * of snd_mss.
+ *
+ * @param tc tcp connection
+ * @return number of bytes session is allowed to write
+ */
+u32
+tcp_snd_space (tcp_connection_t * tc)
+{
+ int snd_space, snt_limited;
+
+ if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
+ {
+ snd_space = tcp_available_output_snd_space (tc);
+
+ /* If we haven't gotten dupacks or if we did and have gotten sacked
+ * bytes then we can still send as per Limited Transmit (RFC3042) */
+ if (PREDICT_FALSE (tc->rcv_dupacks != 0
+ && (tcp_opts_sack_permitted (tc)
+ && tc->sack_sb.last_sacked_bytes == 0)))
+ {
+ if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt)
+ tc->limited_transmit = tc->snd_nxt;
+ ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
+
+ snt_limited = tc->snd_nxt - tc->limited_transmit;
+ snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0);
+ }
+ return tcp_round_snd_space (tc, snd_space);
+ }
+
+ if (tcp_in_recovery (tc))
+ {
+ tc->snd_nxt = tc->snd_una_max;
+ snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes
+ - (tc->snd_una_max - tc->snd_congestion);
+ if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
+ return 0;
+ return tcp_round_snd_space (tc, snd_space);
+ }
+
+ /* RFC 5681: When previously unsent data is available and the new value of
+ * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS
+ * bytes of previously unsent data. */
+ if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc))
+ {
+ if (tcp_available_output_snd_space (tc) < tc->snd_mss)
+ return 0;
+ tcp_fastrecovery_1_smss_on (tc);
+ return tc->snd_mss;
+ }
+
+ return 0;
+}
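+
+/*
+ * Worked example (illustrative): with snd_mss = 1460 and 5000 bytes of
+ * available space, tcp_round_snd_space returns 5000 - (5000 % 1460) = 4380,
+ * i.e. exactly three full segments. With less than one mss available it
+ * typically returns 0, so the session layer does not generate sub-mss
+ * segments.
+ */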
+
+u32
+tcp_session_send_space (transport_connection_t * trans_conn)
+{
+ tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+ return clib_min (tcp_snd_space (tc),
+ tc->snd_wnd - (tc->snd_nxt - tc->snd_una));
+}
+
+i32
+tcp_rcv_wnd_available (tcp_connection_t * tc)
+{
+ return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
+}
+
+u32
+tcp_session_tx_fifo_offset (transport_connection_t * trans_conn)
+{
+ tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+
+ ASSERT (seq_geq (tc->snd_nxt, tc->snd_una));
+
+ /* This still works if fast retransmit is on */
+ return (tc->snd_nxt - tc->snd_una);
+}
+
+/* *INDENT-OFF* */
+const static transport_proto_vft_t tcp_proto = {
+ .bind = tcp_session_bind,
+ .unbind = tcp_session_unbind,
+ .push_header = tcp_push_header,
+ .get_connection = tcp_session_get_transport,
+ .get_listener = tcp_session_get_listener,
+ .get_half_open = tcp_half_open_session_get_transport,
+ .open = tcp_session_open,
+ .close = tcp_session_close,
+ .cleanup = tcp_session_cleanup,
+ .send_mss = tcp_session_send_mss,
+ .send_space = tcp_session_send_space,
+ .tx_fifo_offset = tcp_session_tx_fifo_offset,
+ .format_connection = format_tcp_session,
+ .format_listener = format_tcp_listener_session,
+ .format_half_open = format_tcp_half_open_session,
+};
+/* *INDENT-ON* */
+
+void
+tcp_timer_keep_handler (u32 conn_index)
+{
+ u32 thread_index = vlib_get_thread_index ();
+ tcp_connection_t *tc;
+
+ tc = tcp_connection_get (conn_index, thread_index);
+ tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID;
+
+ tcp_connection_close (tc);
+}
+
+void
+tcp_timer_establish_handler (u32 conn_index)
+{
+ tcp_connection_t *tc;
+
+ tc = tcp_half_open_connection_get (conn_index);
+ if (tc)
+ {
+ ASSERT (tc->state == TCP_STATE_SYN_SENT);
+ stream_session_connect_notify (&tc->connection, 1 /* fail */ );
+ TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
+ }
+ else
+ {
+ tc = tcp_connection_get (conn_index, vlib_get_thread_index ());
+ /* note: the connection may have already disappeared */
+ if (PREDICT_FALSE (tc == 0))
+ return;
+ TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
+ ASSERT (tc->state == TCP_STATE_SYN_RCVD);
+ /* Start cleanup. App wasn't notified yet so use delete notify as
+ * opposed to delete to cleanup session layer state. */
+ stream_session_delete_notify (&tc->connection);
+ }
+ tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
+ tcp_connection_cleanup (tc);
+}
+
+void
+tcp_timer_waitclose_handler (u32 conn_index)
+{
+ u32 thread_index = vlib_get_thread_index ();
+ tcp_connection_t *tc;
+
+ tc = tcp_connection_get (conn_index, thread_index);
+ if (!tc)
+ return;
+ tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID;
+
+ /* Session didn't come back with a close(). Send FIN either way
+ * and switch to LAST_ACK. */
+ if (tc->state == TCP_STATE_CLOSE_WAIT)
+ {
+ if (tc->flags & TCP_CONN_FINSNT)
+ {
+ clib_warning ("FIN was sent and still in CLOSE WAIT. Weird!");
+ }
+
+ tcp_send_fin (tc);
+ tc->state = TCP_STATE_LAST_ACK;
+
+ /* Make sure we don't wait in LAST ACK forever */
+ tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+
+ /* Don't delete the connection yet */
+ return;
+ }
+
+ tcp_connection_del (tc);
+}
+
+/* *INDENT-OFF* */
+static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] =
+{
+ tcp_timer_retransmit_handler,
+ tcp_timer_delack_handler,
+ tcp_timer_persist_handler,
+ tcp_timer_keep_handler,
+ tcp_timer_waitclose_handler,
+ tcp_timer_retransmit_syn_handler,
+ tcp_timer_establish_handler
+};
+/* *INDENT-ON* */
+
+static void
+tcp_expired_timers_dispatch (u32 * expired_timers)
+{
+ int i;
+ u32 connection_index, timer_id;
+
+ for (i = 0; i < vec_len (expired_timers); i++)
+ {
+ /* Get session index and timer id */
+ connection_index = expired_timers[i] & 0x0FFFFFFF;
+ timer_id = expired_timers[i] >> 28;
+
+ TCP_EVT_DBG (TCP_EVT_TIMER_POP, connection_index, timer_id);
+
+ /* Handle expiration */
+ (*timer_expiration_handlers[timer_id]) (connection_index);
+ }
+}
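+
+/*
+ * Illustrative note: the dispatch loop above assumes the timer wheel packs
+ * each expired handle as (timer_id << 28) | connection_index, i.e. 4 bits
+ * of timer id over 28 bits of pool index. Handle 0x5000002a thus decodes
+ * to timer 5 on connection 42.
+ */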
+
+void
+tcp_initialize_timer_wheels (tcp_main_t * tm)
+{
+ tw_timer_wheel_16t_2w_512sl_t *tw;
+ /* *INDENT-OFF* */
+ foreach_vlib_main (({
+ tw = &tm->timer_wheels[ii];
+ tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch,
+ 100e-3 /* timer period 100ms */ , ~0);
+ tw->last_run_time = vlib_time_now (this_vlib_main);
+ }));
+ /* *INDENT-ON* */
+}
+
+clib_error_t *
+tcp_main_enable (vlib_main_t * vm)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ ip_protocol_info_t *pi;
+ ip_main_t *im = &ip_main;
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ clib_error_t *error = 0;
+ u32 num_threads;
+ int thread;
+ tcp_connection_t *tc __attribute__ ((unused));
+ u32 preallocated_connections_per_thread;
+
+ if ((error = vlib_call_init_function (vm, ip_main_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
+ return error;
+
+ /*
+ * Registrations
+ */
+
+ /* Register with IP */
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
+ if (pi == 0)
+ return clib_error_return (0, "TCP protocol info AWOL");
+ pi->format_header = format_tcp_header;
+ pi->unformat_pg_edit = unformat_pg_tcp_header;
+
+ ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index);
+ ip6_register_protocol (IP_PROTOCOL_TCP, tcp6_input_node.index);
+
+ /* Register as transport with session layer */
+ session_register_transport (TRANSPORT_PROTO_TCP, 1, &tcp_proto);
+ session_register_transport (TRANSPORT_PROTO_TCP, 0, &tcp_proto);
+
+ /*
+ * Initialize data structures
+ */
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+ vec_validate (tm->connections, num_threads - 1);
+
+ /*
+ * Preallocate connections. Assume that thread 0 won't
+   * use preallocated connections when running multi-core
+ */
+ if (num_threads == 1)
+ {
+ thread = 0;
+ preallocated_connections_per_thread = tm->preallocated_connections;
+ }
+ else
+ {
+ thread = 1;
+ preallocated_connections_per_thread =
+ tm->preallocated_connections / (num_threads - 1);
+ }
+ for (; thread < num_threads; thread++)
+ {
+ if (preallocated_connections_per_thread)
+ pool_init_fixed (tm->connections[thread],
+ preallocated_connections_per_thread);
+ }
+
+ /*
+ * Use a preallocated half-open connection pool?
+ */
+ if (tm->preallocated_half_open_connections)
+ pool_init_fixed (tm->half_open_connections,
+ tm->preallocated_half_open_connections);
+
+ /* Initialize per worker thread tx buffers (used for control messages) */
+ vec_validate (tm->tx_buffers, num_threads - 1);
+
+ /* Initialize timer wheels */
+ vec_validate (tm->timer_wheels, num_threads - 1);
+ tcp_initialize_timer_wheels (tm);
+
+ /* Initialize clocks per tick for TCP timestamp. Used to compute
+ * monotonically increasing timestamps. */
+ tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock
+ / TCP_TSTAMP_RESOLUTION;
+
+ if (tm->local_endpoints_table_buckets == 0)
+ tm->local_endpoints_table_buckets = 250000;
+ if (tm->local_endpoints_table_memory == 0)
+ tm->local_endpoints_table_memory = 512 << 20;
+
+ clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table",
+ tm->local_endpoints_table_buckets,
+ tm->local_endpoints_table_memory);
+
+ /* Initialize [port-allocator] random number seed */
+ tm->port_allocator_seed = (u32) clib_cpu_time_now ();
+
+ if (num_threads > 1)
+ {
+ clib_spinlock_init (&tm->half_open_lock);
+ clib_spinlock_init (&tm->local_endpoints_lock);
+ }
+
+ vec_validate (tm->tx_frames[0], num_threads - 1);
+ vec_validate (tm->tx_frames[1], num_threads - 1);
+ vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1);
+ vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1);
+
+ tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ vec_validate (tm->time_now, num_threads - 1);
+ return error;
+}
+
+clib_error_t *
+vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en)
+{
+ if (is_en)
+ {
+ if (tcp_main.is_enabled)
+ return 0;
+
+ return tcp_main_enable (vm);
+ }
+ else
+ {
+ tcp_main.is_enabled = 0;
+ }
+
+ return 0;
+}
+
+void
+tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add)
+{
+ tcp_main_t *tm = &tcp_main;
+ if (is_ip4)
+ tm->punt_unknown4 = is_add;
+ else
+ tm->punt_unknown6 = is_add;
+}
+
+clib_error_t *
+tcp_init (vlib_main_t * vm)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tm->is_enabled = 0;
+ tcp_api_reference ();
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (tcp_init);
+
+static clib_error_t *
+tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u64 tmp;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat
+ (input, "preallocated-connections %d",
+ &tm->preallocated_connections))
+ ;
+ else if (unformat (input, "preallocated-half-open-connections %d",
+ &tm->preallocated_half_open_connections))
+ ;
+ else if (unformat (input, "local-endpoints-table-memory %U",
+ unformat_memory_size, &tmp))
+ {
+ if (tmp >= 0x100000000)
+ return clib_error_return (0, "memory size %llx (%lld) too large",
+ tmp, tmp);
+ tm->local_endpoints_table_memory = tmp;
+ }
+ else if (unformat (input, "local-endpoints-table-buckets %d",
+ &tm->local_endpoints_table_buckets))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp");
+
+
+/**
+ * \brief Configure an ipv4 source address range
+ * @param vm vlib_main_t pointer
+ * @param start first ipv4 address in the source address range
+ * @param end last ipv4 address in the source address range
+ * @param table_id VRF / table ID, 0 for the default FIB
+ * @return 0 if all OK, else an error indication from api_errno.h
+ */
+
+int
+tcp_configure_v4_source_address_range (vlib_main_t * vm,
+ ip4_address_t * start,
+ ip4_address_t * end, u32 table_id)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vnet_main_t *vnm = vnet_get_main ();
+ u32 start_host_byte_order, end_host_byte_order;
+ fib_prefix_t prefix;
+ vnet_sw_interface_t *si;
+ fib_node_index_t fei;
+ u32 fib_index = 0;
+ u32 sw_if_index;
+ int rv;
+ int vnet_proxy_arp_add_del (ip4_address_t * lo_addr,
+ ip4_address_t * hi_addr, u32 fib_index,
+ int is_del);
+
+ memset (&prefix, 0, sizeof (prefix));
+
+ fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
+
+ if (fib_index == ~0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ start_host_byte_order = clib_net_to_host_u32 (start->as_u32);
+ end_host_byte_order = clib_net_to_host_u32 (end->as_u32);
+
+  /* sanity check for reversed args or an unreasonably large range */
+ if ((end_host_byte_order - start_host_byte_order) > (10 << 10))
+ return VNET_API_ERROR_INVALID_ARGUMENT;
+
+ /* Lookup the last address, to identify the interface involved */
+ prefix.fp_len = 32;
+ prefix.fp_proto = FIB_PROTOCOL_IP4;
+ memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t));
+
+ fei = fib_table_lookup (fib_index, &prefix);
+
+ /* Couldn't find route to destination. Bail out. */
+ if (fei == FIB_NODE_INDEX_INVALID)
+ return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+
+ /* Enable proxy arp on the interface */
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+ si->flags |= VNET_SW_INTERFACE_FLAG_PROXY_ARP;
+
+ /* Configure proxy arp across the range */
+ rv = vnet_proxy_arp_add_del (start, end, fib_index, 0 /* is_del */ );
+
+ if (rv)
+ return rv;
+
+ do
+ {
+ dpo_id_t dpo = DPO_INVALID;
+
+ vec_add1 (tm->ip4_src_addresses, start[0]);
+
+ /* Add local adjacencies for the range */
+
+ receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ ,
+ NULL, &dpo);
+ prefix.fp_len = 32;
+ prefix.fp_proto = FIB_PROTOCOL_IP4;
+ prefix.fp_addr.ip4.as_u32 = start->as_u32;
+
+ fib_table_entry_special_dpo_update (fib_index,
+ &prefix,
+ FIB_SOURCE_API,
+ FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+ dpo_reset (&dpo);
+
+ start_host_byte_order++;
+ start->as_u32 = clib_host_to_net_u32 (start_host_byte_order);
+ }
+ while (start_host_byte_order <= end_host_byte_order);
+
+ return 0;
+}
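+
+/*
+ * Illustrative usage (hypothetical RFC 5737 addresses), wrapped in #if 0:
+ * configure six v4 source addresses in the default FIB; equivalent to the
+ * debug CLI "tcp src-address 192.0.2.1 - 192.0.2.6" defined below.
+ */
+#if 0
+static int
+tcp_v4_src_range_example (vlib_main_t * vm)
+{
+  ip4_address_t start, end;
+
+  start.as_u32 = clib_host_to_net_u32 (0xc0000201); /* 192.0.2.1 */
+  end.as_u32 = clib_host_to_net_u32 (0xc0000206);   /* 192.0.2.6 */
+  return tcp_configure_v4_source_address_range (vm, &start, &end,
+						 0 /* table_id */ );
+}
+#endif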
+
+/**
+ * \brief Configure an ipv6 source address range
+ * @param vm vlib_main_t pointer
+ * @param start first ipv6 address in the source address range
+ * @param end last ipv6 address in the source address range
+ * @param table_id VRF / table ID, 0 for the default FIB
+ * @return 0 if all OK, else an error indication from api_errno.h
+ */
+
+int
+tcp_configure_v6_source_address_range (vlib_main_t * vm,
+ ip6_address_t * start,
+ ip6_address_t * end, u32 table_id)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ fib_prefix_t prefix;
+ u32 fib_index = 0;
+ fib_node_index_t fei;
+ u32 sw_if_index;
+
+ memset (&prefix, 0, sizeof (prefix));
+
+ fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id);
+
+ if (fib_index == ~0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ while (1)
+ {
+ int i;
+ ip6_address_t tmp;
+ dpo_id_t dpo = DPO_INVALID;
+
+ /* Remember this address */
+ vec_add1 (tm->ip6_src_addresses, start[0]);
+
+ /* Lookup the prefix, to identify the interface involved */
+ prefix.fp_len = 128;
+ prefix.fp_proto = FIB_PROTOCOL_IP6;
+ memcpy (&prefix.fp_addr.ip6, start, sizeof (ip6_address_t));
+
+ fei = fib_table_lookup (fib_index, &prefix);
+
+ /* Couldn't find route to destination. Bail out. */
+ if (fei == FIB_NODE_INDEX_INVALID)
+ return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+
+ if (sw_if_index == (u32) ~ 0)
+ return VNET_API_ERROR_NO_MATCHING_INTERFACE;
+
+ /* Add a proxy neighbor discovery entry for this address */
+ ip6_neighbor_proxy_add_del (sw_if_index, start, 0 /* is_del */ );
+
+ /* Add a receive adjacency for this address */
+ receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ ,
+ NULL, &dpo);
+
+ fib_table_entry_special_dpo_update (fib_index,
+ &prefix,
+ FIB_SOURCE_API,
+ FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+ dpo_reset (&dpo);
+
+ /* Done with the entire range? */
+ if (!memcmp (start, end, sizeof (start[0])))
+ break;
+
+ /* Increment the address. DGMS. */
+ tmp = start[0];
+ for (i = 15; i >= 0; i--)
+ {
+ tmp.as_u8[i] += 1;
+ if (tmp.as_u8[i] != 0)
+ break;
+ }
+ start[0] = tmp;
+ }
+ return 0;
+}
+
+static clib_error_t *
+tcp_src_address (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd_arg)
+{
+ ip4_address_t v4start, v4end;
+ ip6_address_t v6start, v6end;
+ u32 table_id = 0;
+ int v4set = 0;
+ int v6set = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U - %U", unformat_ip4_address, &v4start,
+ unformat_ip4_address, &v4end))
+ v4set = 1;
+ else if (unformat (input, "%U", unformat_ip4_address, &v4start))
+ {
+ memcpy (&v4end, &v4start, sizeof (v4start));
+ v4set = 1;
+ }
+ else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start,
+ unformat_ip6_address, &v6end))
+ v6set = 1;
+ else if (unformat (input, "%U", unformat_ip6_address, &v6start))
+ {
+ memcpy (&v6end, &v6start, sizeof (v6start));
+ v6set = 1;
+ }
+ else if (unformat (input, "fib-table %d", &table_id))
+ ;
+ else
+ break;
+ }
+
+ if (!v4set && !v6set)
+ return clib_error_return (0, "at least one v4 or v6 address required");
+
+ if (v4set)
+ {
+ rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end,
+ table_id);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "Invalid table-id %d", table_id);
+
+ case VNET_API_ERROR_INVALID_ARGUMENT:
+ return clib_error_return (0, "Invalid address range %U - %U",
+ format_ip4_address, &v4start,
+ format_ip4_address, &v4end);
+ default:
+ return clib_error_return (0, "error %d", rv);
+ }
+ }
+ if (v6set)
+ {
+ rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end,
+ table_id);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "Invalid table-id %d", table_id);
+
+ default:
+ return clib_error_return (0, "error %d", rv);
+ }
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tcp_src_address_command, static) =
+{
+ .path = "tcp src-address",
+ .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range",
+ .function = tcp_src_address,
+};
+/* *INDENT-ON* */
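+/* Usage sketch (addresses are illustrative; syntax follows the unformat
+ * logic above):
+ *
+ *   vpp# tcp src-address 10.0.0.1 - 10.0.0.63
+ *   vpp# tcp src-address 2001:db8::1 fib-table 1
+ */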
+
+static u8 *
+tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb)
+{
+#if TCP_SCOREBOARD_TRACE
+
+ scoreboard_trace_elt_t *block;
+ int i = 0;
+
+ if (!sb->trace)
+ return s;
+
+ s = format (s, "scoreboard trace:");
+ vec_foreach (block, sb->trace)
+ {
+ s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end,
+ block->ack, block->snd_una_max, block->group);
+ if ((++i % 3) == 0)
+ s = format (s, "\n");
+ }
+ return s;
+#else
+  return s;
+#endif
+}
+
+static clib_error_t *
+tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd_arg)
+{
+ transport_connection_t *tconn = 0;
+ tcp_connection_t *tc;
+ u8 *s = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U", unformat_transport_connection, &tconn,
+ TRANSPORT_PROTO_TCP))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (!TCP_SCOREBOARD_TRACE)
+ {
+ vlib_cli_output (vm, "scoreboard tracing not enabled");
+ return 0;
+ }
+
+  tc = tcp_get_connection_from_transport (tconn);
+  if (!tc)
+    {
+      vlib_cli_output (vm, "connection not found");
+      return 0;
+    }
+  s = tcp_scoreboard_dump_trace (s, &tc->sack_sb);
+ vlib_cli_output (vm, "%v", s);
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) =
+{
+ .path = "show tcp scoreboard trace",
+ .short_help = "show tcp scoreboard trace <connection>",
+ .function = tcp_show_scoreboard_trace_fn,
+};
+/* *INDENT-ON* */
+
+u8 *
+tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose)
+{
+ int i, trace_len;
+ scoreboard_trace_elt_t *trace;
+  u32 next_ack = 0, left, group, has_new_ack = 0;
+ tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc;
+ sack_block_t *block;
+
+ if (!tc)
+ return s;
+
+ memset (dummy_tc, 0, sizeof (*dummy_tc));
+ tcp_connection_timers_init (dummy_tc);
+ scoreboard_init (&dummy_tc->sack_sb);
+ dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
+
+#if TCP_SCOREBOARD_TRACE
+ trace = tc->sack_sb.trace;
+ trace_len = vec_len (tc->sack_sb.trace);
+#else
+ trace = 0;
+ trace_len = 0;
+#endif
+
+ for (i = 0; i < trace_len; i++)
+ {
+ if (trace[i].ack != 0)
+ {
+ dummy_tc->snd_una = trace[i].ack - 1448;
+ dummy_tc->snd_una_max = trace[i].ack;
+ }
+ }
+
+ left = 0;
+ while (left < trace_len)
+ {
+ group = trace[left].group;
+ vec_reset_length (dummy_tc->rcv_opts.sacks);
+ has_new_ack = 0;
+ while (trace[left].group == group)
+ {
+ if (trace[left].ack != 0)
+ {
+ if (verbose)
+ s = format (s, "Adding ack %u, snd_una_max %u, segs: ",
+ trace[left].ack, trace[left].snd_una_max);
+ dummy_tc->snd_una_max = trace[left].snd_una_max;
+ next_ack = trace[left].ack;
+ has_new_ack = 1;
+ }
+ else
+ {
+ if (verbose)
+ s = format (s, "[%u, %u], ", trace[left].start,
+ trace[left].end);
+ vec_add2 (dummy_tc->rcv_opts.sacks, block, 1);
+ block->start = trace[left].start;
+ block->end = trace[left].end;
+ }
+ left++;
+ }
+
+ /* Push segments */
+ tcp_rcv_sacks (dummy_tc, next_ack);
+ if (has_new_ack)
+ dummy_tc->snd_una = next_ack + dummy_tc->sack_sb.snd_una_adv;
+
+ if (verbose)
+ s = format (s, "result: %U", format_tcp_scoreboard,
+ &dummy_tc->sack_sb);
+
+ }
+ s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb);
+
+ return s;
+}
+
+static clib_error_t *
+tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd_arg)
+{
+ transport_connection_t *tconn = 0;
+ tcp_connection_t *tc = 0;
+ u8 *str = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U", unformat_transport_connection, &tconn,
+ TRANSPORT_PROTO_TCP))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (!TCP_SCOREBOARD_TRACE)
+ {
+ vlib_cli_output (vm, "scoreboard tracing not enabled");
+ return 0;
+ }
+
+ tc = tcp_get_connection_from_transport (tconn);
+ if (!tc)
+ {
+ vlib_cli_output (vm, "connection not found");
+ return 0;
+ }
+ str = tcp_scoreboard_replay (str, tc, 1);
+ vlib_cli_output (vm, "%v", str);
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) =
+{
+ .path = "tcp replay scoreboard",
+ .short_help = "tcp replay scoreboard <connection>",
+ .function = tcp_scoreboard_trace_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd_arg)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ return clib_error_return (0, "unknown input `%U'", format_unformat_error,
+ input);
+ vlib_cli_output (vm, "IPv4 TCP punt: %s",
+ tm->punt_unknown4 ? "enabled" : "disabled");
+ vlib_cli_output (vm, "IPv6 TCP punt: %s",
+ tm->punt_unknown6 ? "enabled" : "disabled");
+ return 0;
+}
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_tcp_punt_command, static) =
+{
+ .path = "show tcp punt",
+ .short_help = "show tcp punt",
+ .function = show_tcp_punt_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
new file mode 100644
index 00000000..259dbca1
--- /dev/null
+++ b/src/vnet/tcp/tcp.h
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _vnet_tcp_h_
+#define _vnet_tcp_h_
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/tcp/tcp_packet.h>
+#include <vnet/tcp/tcp_timer.h>
+#include <vnet/session/transport.h>
+#include <vnet/session/session.h>
+#include <vnet/tcp/tcp_debug.h>
+
+#define TCP_TICK 0.001 /**< TCP tick period (s) */
+#define THZ ((u32) (1/TCP_TICK))	/**< TCP tick frequency */
+#define TCP_TSTAMP_RESOLUTION TCP_TICK	/**< Time stamp resolution */
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * THZ) /**< 24 days */
+#define TCP_FIB_RECHECK_PERIOD (1 * THZ)	/**< Recheck every 1s */
+#define TCP_MAX_OPTION_SPACE 40
+
+#define TCP_DUPACK_THRESHOLD 3
+#define TCP_MAX_RX_FIFO_SIZE (4 << 20)
+#define TCP_MIN_RX_FIFO_SIZE (4 << 10)
+#define TCP_IW_N_SEGMENTS 10
+#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */
+#define TCP_USE_SACKS 1 /**< Disable only for testing */
+
+/** TCP FSM state definitions as per RFC793. */
+#define foreach_tcp_fsm_state \
+ _(CLOSED, "CLOSED") \
+ _(LISTEN, "LISTEN") \
+ _(SYN_SENT, "SYN_SENT") \
+ _(SYN_RCVD, "SYN_RCVD") \
+ _(ESTABLISHED, "ESTABLISHED") \
+ _(CLOSE_WAIT, "CLOSE_WAIT") \
+ _(FIN_WAIT_1, "FIN_WAIT_1") \
+ _(LAST_ACK, "LAST_ACK") \
+ _(CLOSING, "CLOSING") \
+ _(FIN_WAIT_2, "FIN_WAIT_2") \
+ _(TIME_WAIT, "TIME_WAIT")
+
+typedef enum _tcp_state
+{
+#define _(sym, str) TCP_STATE_##sym,
+ foreach_tcp_fsm_state
+#undef _
+ TCP_N_STATES
+} tcp_state_t;
+
+format_function_t format_tcp_state;
+format_function_t format_tcp_flags;
+format_function_t format_tcp_sacks;
+format_function_t format_tcp_rcv_sacks;
+
+/** TCP timers */
+#define foreach_tcp_timer \
+ _(RETRANSMIT, "RETRANSMIT") \
+ _(DELACK, "DELAYED ACK") \
+ _(PERSIST, "PERSIST") \
+ _(KEEP, "KEEP") \
+ _(WAITCLOSE, "WAIT CLOSE") \
+ _(RETRANSMIT_SYN, "RETRANSMIT SYN") \
+ _(ESTABLISH, "ESTABLISH")
+
+typedef enum _tcp_timers
+{
+#define _(sym, str) TCP_TIMER_##sym,
+ foreach_tcp_timer
+#undef _
+ TCP_N_TIMERS
+} tcp_timers_e;
+
+typedef void (timer_expiration_handler) (u32 index);
+
+extern timer_expiration_handler tcp_timer_delack_handler;
+extern timer_expiration_handler tcp_timer_retransmit_handler;
+extern timer_expiration_handler tcp_timer_persist_handler;
+extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
+
+#define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
+
+/* Timer delays as multiples of 100ms */
+#define TCP_TO_TIMER_TICK (TCP_TICK*10)	/* Factor for converting TCP
+						 * ticks (1 ms) to 100 ms timer units */
+#define TCP_DELACK_TIME 1 /* 0.1s */
+#define TCP_ESTABLISH_TIME 750 /* 75s */
+#define TCP_SYN_RCVD_TIME 600 /* 60s */
+#define TCP_2MSL_TIME 300 /* 30s */
+#define TCP_CLOSEWAIT_TIME 20 /* 2s */
+#define TCP_TIMEWAIT_TIME 20 /* 2s */
+#define TCP_CLEANUP_TIME 10 /* 1s Time to wait before cleanup */
+#define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */
+
+#define TCP_RTO_MAX (60 * THZ)	/* Max RTO (60s); the smallest cap RFC6298 allows */
+#define TCP_RTO_MIN (0.2 * THZ)	/* Min RTO (200ms) - lower than standard */
+#define TCP_RTT_MAX (30 * THZ)	/* 30s (probably too much) */
+#define TCP_RTO_SYN_RETRIES 3	/* SYN retries without doubling RTO */
+#define TCP_RTO_INIT (1 * THZ)	/* Initial retransmit timer */
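+
+/* Worked example: rto is kept in TCP ticks (TCP_TICK = 1 ms) while the
+ * timer wheel advances in 100 ms slots, so TCP_RTO_INIT = 1 * THZ = 1000
+ * TCP ticks becomes 1000 * TCP_TO_TIMER_TICK = 10 wheel ticks = 1 s when
+ * tcp_retransmit_timer_set () below schedules it. */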
+
+/** TCP connection flags */
+#define foreach_tcp_connection_flag \
+ _(SNDACK, "Send ACK") \
+ _(FINSNT, "FIN sent") \
+ _(SENT_RCV_WND0, "Sent 0 receive window") \
+ _(RECOVERY, "Recovery on") \
+ _(FAST_RECOVERY, "Fast Recovery on") \
+ _(FR_1_SMSS, "Sent 1 SMSS") \
+ _(HALF_OPEN_DONE, "Half-open completed") \
+ _(FINPNDG, "FIN pending")
+
+typedef enum _tcp_connection_flag_bits
+{
+#define _(sym, str) TCP_CONN_##sym##_BIT,
+ foreach_tcp_connection_flag
+#undef _
+ TCP_CONN_N_FLAG_BITS
+} tcp_connection_flag_bits_e;
+
+typedef enum _tcp_connection_flag
+{
+#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT,
+ foreach_tcp_connection_flag
+#undef _
+ TCP_CONN_N_FLAGS
+} tcp_connection_flags_e;
+
+/** TCP buffer flags */
+#define foreach_tcp_buf_flag \
+ _ (ACK) /**< Sending ACK. */ \
+  _ (DUPACK)	/**< Sending DUPACK. */
+
+enum
+{
+#define _(f) TCP_BUF_BIT_##f,
+ foreach_tcp_buf_flag
+#undef _
+ TCP_N_BUF_BITS,
+};
+
+enum
+{
+#define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f,
+ foreach_tcp_buf_flag
+#undef _
+};
+
+#define TCP_SCOREBOARD_TRACE (0)
+#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */
+#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
+
+typedef struct _scoreboard_trace_elt
+{
+ u32 start;
+ u32 end;
+ u32 ack;
+ u32 snd_una_max;
+ u32 group;
+} scoreboard_trace_elt_t;
+
+typedef struct _sack_scoreboard_hole
+{
+ u32 next; /**< Index for next entry in linked list */
+ u32 prev; /**< Index for previous entry in linked list */
+ u32 start; /**< Start sequence number */
+ u32 end; /**< End sequence number */
+ u8 is_lost; /**< Mark hole as lost */
+} sack_scoreboard_hole_t;
+
+typedef struct _sack_scoreboard
+{
+ sack_scoreboard_hole_t *holes; /**< Pool of holes */
+ u32 head; /**< Index of first entry */
+ u32 tail; /**< Index of last entry */
+ u32 sacked_bytes; /**< Number of bytes sacked in sb */
+ u32 last_sacked_bytes; /**< Number of bytes last sacked */
+ u32 last_bytes_delivered; /**< Number of sack bytes delivered */
+ u32 snd_una_adv; /**< Bytes to add to snd_una */
+ u32 high_sacked; /**< Highest byte sacked (fack) */
+ u32 high_rxt; /**< Highest retransmitted sequence */
+ u32 rescue_rxt; /**< Rescue sequence number */
+ u32 lost_bytes; /**< Bytes lost as per RFC6675 */
+ u32 cur_rxt_hole; /**< Retransmitting from this hole */
+
+#if TCP_SCOREBOARD_TRACE
+ scoreboard_trace_elt_t *trace;
+#endif
+
+} sack_scoreboard_t;
+
+#if TCP_SCOREBOARD_TRACE
+#define tcp_scoreboard_trace_add(_tc, _ack) \
+{ \
+ static u64 _group = 0; \
+ sack_scoreboard_t *_sb = &_tc->sack_sb; \
+ sack_block_t *_sack, *_sacks; \
+ scoreboard_trace_elt_t *_elt; \
+ int i; \
+ _group++; \
+ _sacks = _tc->rcv_opts.sacks; \
+ for (i = 0; i < vec_len (_sacks); i++) \
+ { \
+ _sack = &_sacks[i]; \
+ vec_add2 (_sb->trace, _elt, 1); \
+ _elt->start = _sack->start; \
+ _elt->end = _sack->end; \
+ _elt->ack = _elt->end == _ack ? _ack : 0; \
+ _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \
+ _elt->group = _group; \
+ } \
+}
+#else
+#define tcp_scoreboard_trace_add(_tc, _ack)
+#endif
+
+typedef enum _tcp_cc_algorithm_type
+{
+ TCP_CC_NEWRENO,
+} tcp_cc_algorithm_type_e;
+
+typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t;
+
+typedef enum _tcp_cc_ack_t
+{
+ TCP_CC_ACK,
+ TCP_CC_DUPACK,
+ TCP_CC_PARTIALACK
+} tcp_cc_ack_t;
+
+typedef struct _tcp_connection
+{
+ transport_connection_t connection; /**< Common transport data. First! */
+
+ u8 state; /**< TCP state as per tcp_state_t */
+ u16 flags; /**< Connection flags (see tcp_conn_flags_e) */
+ u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */
+
+ /* TODO RFC4898 */
+
+ /** Send sequence variables RFC793 */
+ u32 snd_una; /**< oldest unacknowledged sequence number */
+ u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/
+ u32 snd_wnd; /**< send window */
+ u32 snd_wl1; /**< seq number used for last snd.wnd update */
+ u32 snd_wl2; /**< ack number used for last snd.wnd update */
+ u32 snd_nxt; /**< next seq number to be sent */
+ u16 snd_mss; /**< Effective send max seg (data) size */
+
+ /** Receive sequence variables RFC793 */
+ u32 rcv_nxt; /**< next sequence number expected */
+ u32 rcv_wnd; /**< receive window we expect */
+
+ u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */
+ u32 iss; /**< initial sent sequence */
+ u32 irs; /**< initial remote sequence */
+
+ /* Options */
+ tcp_options_t rcv_opts; /**< Rx options for connection */
+ tcp_options_t snd_opts; /**< Tx options for connection */
+ u8 snd_opts_len; /**< Tx options len */
+ u8 rcv_wscale; /**< Window scale to advertise to peer */
+ u8 snd_wscale; /**< Window scale to use when sending */
+ u32 tsval_recent; /**< Last timestamp received */
+ u32 tsval_recent_age; /**< When last updated tstamp_recent*/
+
+ sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */
+ sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */
+
+ u16 rcv_dupacks; /**< Number of DUPACKs received */
+ u8 snt_dupacks; /**< Number of DUPACKs sent in a burst */
+
+ /* Congestion control */
+ u32 cwnd; /**< Congestion window */
+ u32 ssthresh; /**< Slow-start threshold */
+ u32 prev_ssthresh; /**< ssthresh before congestion */
+  u32 prev_cwnd;	/**< cwnd before congestion */
+ u32 bytes_acked; /**< Bytes acknowledged by current segment */
+ u32 snd_rxt_bytes; /**< Retransmitted bytes */
+ u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */
+ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */
+ u32 snd_congestion; /**< snd_una_max when congestion is detected */
+ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */
+
+ /* RTT and RTO */
+ u32 rto; /**< Retransmission timeout */
+ u32 rto_boff; /**< Index for RTO backoff */
+ u32 srtt; /**< Smoothed RTT */
+ u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */
+ u32 rtt_ts; /**< Timestamp for tracked ACK */
+ u32 rtt_seq; /**< Sequence number for tracked ACK */
+
+ u16 mss; /**< Our max seg size that includes options */
+ u32 limited_transmit; /**< snd_nxt when limited transmit starts */
+ u32 last_fib_check; /**< Last time we checked fib route for peer */
+} tcp_connection_t;
+
+struct _tcp_cc_algorithm
+{
+ void (*rcv_ack) (tcp_connection_t * tc);
+ void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack);
+ void (*congestion) (tcp_connection_t * tc);
+ void (*recovered) (tcp_connection_t * tc);
+ void (*init) (tcp_connection_t * tc);
+};
+
+#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY
+#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY
+#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY
+#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY
+#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
+#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
+#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh)
+#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
+#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
+#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
+
+#define tcp_in_cong_recovery(tc) ((tc)->flags & \
+ (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+
+always_inline void
+tcp_cong_recovery_off (tcp_connection_t * tc)
+{
+ tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
+ tcp_fastrecovery_1_smss_off (tc);
+}
+
+typedef enum
+{
+ TCP_IP4,
+ TCP_IP6,
+ TCP_N_AF,
+} tcp_af_t;
+
+typedef enum _tcp_error
+{
+#define tcp_error(n,s) TCP_ERROR_##n,
+#include <vnet/tcp/tcp_error.def>
+#undef tcp_error
+ TCP_N_ERROR,
+} tcp_error_t;
+
+typedef struct _tcp_lookup_dispatch
+{
+ u8 next, error;
+} tcp_lookup_dispatch_t;
+
+typedef struct _tcp_main
+{
+ /* Per-worker thread tcp connection pools */
+ tcp_connection_t **connections;
+
+ /* Pool of listeners. */
+ tcp_connection_t *listener_pool;
+
+ /** Dispatch table by state and flags */
+ tcp_lookup_dispatch_t dispatch_table[TCP_N_STATES][64];
+
+ u8 log2_tstamp_clocks_per_tick;
+ f64 tstamp_ticks_per_clock;
+ u32 *time_now;
+
+ /** per-worker tx buffer free lists */
+ u32 **tx_buffers;
+ /** per-worker tx frames to tcp 4/6 output nodes */
+ vlib_frame_t **tx_frames[2];
+ /** per-worker tx frames to ip 4/6 lookup nodes */
+ vlib_frame_t **ip_lookup_tx_frames[2];
+
+ /* Per worker-thread timer wheel for connections timers */
+ tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
+
+ /* Pool of half-open connections on which we've sent a SYN */
+ tcp_connection_t *half_open_connections;
+ clib_spinlock_t half_open_lock;
+
+ /* Pool of local TCP endpoints */
+ transport_endpoint_t *local_endpoints;
+
+ /* Local endpoints lookup table */
+ transport_endpoint_table_t local_endpoints_table;
+ clib_spinlock_t local_endpoints_lock;
+
+ /* Congestion control algorithms registered */
+ tcp_cc_algorithm_t *cc_algos;
+
+ /* Flag that indicates if stack is on or off */
+ u8 is_enabled;
+
+ /** Number of preallocated connections */
+ u32 preallocated_connections;
+ u32 preallocated_half_open_connections;
+
+ /** Transport table (preallocation) size parameters */
+ u32 local_endpoints_table_memory;
+ u32 local_endpoints_table_buckets;
+
+ /** Vectors of src addresses. Optional unless one needs > 63K active-opens */
+ ip4_address_t *ip4_src_addresses;
+ u32 last_v4_address_rotor;
+ u32 last_v6_address_rotor;
+ ip6_address_t *ip6_src_addresses;
+
+ /** Port allocator random number generator seed */
+ u32 port_allocator_seed;
+
+ /** vlib buffer size */
+ u32 bytes_per_buffer;
+
+ u8 punt_unknown4;
+ u8 punt_unknown6;
+} tcp_main_t;
+
+extern tcp_main_t tcp_main;
+extern vlib_node_registration_t tcp4_input_node;
+extern vlib_node_registration_t tcp6_input_node;
+extern vlib_node_registration_t tcp4_output_node;
+extern vlib_node_registration_t tcp6_output_node;
+
+always_inline tcp_main_t *
+vnet_get_tcp_main ()
+{
+ return &tcp_main;
+}
+
+always_inline tcp_header_t *
+tcp_buffer_hdr (vlib_buffer_t * b)
+{
+ ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
+ return (tcp_header_t *) (b->data + b->current_data
+ + vnet_buffer (b)->tcp.hdr_offset);
+}
+
+clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en);
+
+void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add);
+
+always_inline tcp_connection_t *
+tcp_connection_get (u32 conn_index, u32 thread_index)
+{
+ if (PREDICT_FALSE
+ (pool_is_free_index (tcp_main.connections[thread_index], conn_index)))
+ return 0;
+ return pool_elt_at_index (tcp_main.connections[thread_index], conn_index);
+}
+
+always_inline tcp_connection_t *
+tcp_connection_get_if_valid (u32 conn_index, u32 thread_index)
+{
+ if (tcp_main.connections[thread_index] == 0)
+ return 0;
+ if (pool_is_free_index (tcp_main.connections[thread_index], conn_index))
+ return 0;
+ return pool_elt_at_index (tcp_main.connections[thread_index], conn_index);
+}
+
+always_inline tcp_connection_t *
+tcp_get_connection_from_transport (transport_connection_t * tconn)
+{
+ return (tcp_connection_t *) tconn;
+}
+
+void tcp_connection_close (tcp_connection_t * tc);
+void tcp_connection_cleanup (tcp_connection_t * tc);
+void tcp_connection_del (tcp_connection_t * tc);
+int tcp_half_open_connection_cleanup (tcp_connection_t * tc);
+tcp_connection_t *tcp_connection_new (u8 thread_index);
+void tcp_connection_reset (tcp_connection_t * tc);
+int tcp_configure_v4_source_address_range (vlib_main_t * vm,
+ ip4_address_t * start,
+ ip4_address_t * end, u32 table_id);
+int tcp_configure_v6_source_address_range (vlib_main_t * vm,
+ ip6_address_t * start,
+ ip6_address_t * end, u32 table_id);
+void tcp_api_reference (void);
+u8 *format_tcp_connection_id (u8 * s, va_list * args);
+u8 *format_tcp_connection (u8 * s, va_list * args);
+u8 *format_tcp_scoreboard (u8 * s, va_list * args);
+
+u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose);
+
+always_inline tcp_connection_t *
+tcp_listener_get (u32 tli)
+{
+ return pool_elt_at_index (tcp_main.listener_pool, tli);
+}
+
+always_inline tcp_connection_t *
+tcp_half_open_connection_get (u32 conn_index)
+{
+ tcp_connection_t *tc = 0;
+ clib_spinlock_lock_if_init (&tcp_main.half_open_lock);
+ if (!pool_is_free_index (tcp_main.half_open_connections, conn_index))
+ tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index);
+ clib_spinlock_unlock_if_init (&tcp_main.half_open_lock);
+ return tc;
+}
+
+void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b);
+void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b);
+void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b);
+void tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt,
+ u8 is_ip4);
+void tcp_send_reset (tcp_connection_t * tc);
+void tcp_send_syn (tcp_connection_t * tc);
+void tcp_send_fin (tcp_connection_t * tc);
+void tcp_init_mss (tcp_connection_t * tc);
+void tcp_update_snd_mss (tcp_connection_t * tc);
+void tcp_update_rto (tcp_connection_t * tc);
+void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4);
+void tcp_flush_frames_to_output (u8 thread_index);
+
+always_inline u32
+tcp_end_seq (tcp_header_t * th, u32 len)
+{
+ return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len;
+}
+
+/* Modulo arithmetic for TCP sequence numbers */
+#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0)
+#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
+#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
+#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
+#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))
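+
+/* Example: with 32-bit wrap-around, seq_lt (0xfffffff0, 0x10) is true
+ * because (i32)(0xfffffff0 - 0x10) = (i32)0xffffffe0 < 0, even though a
+ * plain unsigned '<' would order the two the other way. */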
+
+/* Modulo arithmetic for timestamps */
+#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
+#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)
+
+/**
+ * Our estimate of the number of bytes that have left the network
+ */
+always_inline u32
+tcp_bytes_out (const tcp_connection_t * tc)
+{
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
+ else
+ return tc->rcv_dupacks * tc->snd_mss;
+}
+
+/**
+ * Our estimate of the number of bytes in flight (pipe size)
+ */
+always_inline u32
+tcp_flight_size (const tcp_connection_t * tc)
+{
+ int flight_size;
+
+ flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc)
+ + tc->snd_rxt_bytes;
+
+ if (flight_size < 0)
+ {
+ if (0)
+ clib_warning
+ ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d",
+ tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc),
+ tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes,
+ tc->rcv_opts.flags);
+ return 0;
+ }
+
+ return flight_size;
+}
+
+/**
+ * Initial cwnd as per RFC5681
+ */
+always_inline u32
+tcp_initial_cwnd (const tcp_connection_t * tc)
+{
+ if (tc->snd_mss > 2190)
+ return 2 * tc->snd_mss;
+ else if (tc->snd_mss > 1095)
+ return 3 * tc->snd_mss;
+ else
+ return 4 * tc->snd_mss;
+}
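+
+/* E.g. a typical Ethernet-derived snd_mss of 1460 bytes falls in the
+ * middle RFC5681 bracket (1095 < SMSS <= 2190), so the initial window is
+ * 3 * 1460 = 4380 bytes. */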
+
+always_inline u32
+tcp_loss_wnd (const tcp_connection_t * tc)
+{
+ return tc->snd_mss;
+}
+
+always_inline u32
+tcp_available_snd_wnd (const tcp_connection_t * tc)
+{
+ return clib_min (tc->cwnd, tc->snd_wnd);
+}
+
+always_inline u32
+tcp_available_output_snd_space (const tcp_connection_t * tc)
+{
+ u32 available_wnd = tcp_available_snd_wnd (tc);
+ int flight_size = (int) (tc->snd_nxt - tc->snd_una);
+
+ if (available_wnd <= flight_size)
+ return 0;
+
+ return available_wnd - flight_size;
+}
+
+/**
+ * Estimate of how many bytes we can still push into the network
+ */
+always_inline u32
+tcp_available_snd_space (const tcp_connection_t * tc)
+{
+ u32 available_wnd = tcp_available_snd_wnd (tc);
+ u32 flight_size = tcp_flight_size (tc);
+
+ if (available_wnd <= flight_size)
+ return 0;
+
+ return available_wnd - flight_size;
+}
+
+always_inline u8
+tcp_is_lost_fin (tcp_connection_t * tc)
+{
+ if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
+ return 1;
+ return 0;
+}
+
+i32 tcp_rcv_wnd_available (tcp_connection_t * tc);
+u32 tcp_snd_space (tcp_connection_t * tc);
+void tcp_update_rcv_wnd (tcp_connection_t * tc);
+
+void tcp_retransmit_first_unacked (tcp_connection_t * tc);
+void tcp_fast_retransmit_no_sack (tcp_connection_t * tc);
+void tcp_fast_retransmit_sack (tcp_connection_t * tc);
+void tcp_fast_retransmit (tcp_connection_t * tc);
+void tcp_cc_init_congestion (tcp_connection_t * tc);
+int tcp_cc_recover (tcp_connection_t * tc);
+void tcp_cc_fastrecovery_exit (tcp_connection_t * tc);
+
+fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc);
+
+/* Made public for unit testing only */
+void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
+
+always_inline u32
+tcp_time_now (void)
+{
+ return tcp_main.time_now[vlib_get_thread_index ()];
+}
+
+always_inline u32
+tcp_set_time_now (u32 thread_index)
+{
+ tcp_main.time_now[thread_index] = clib_cpu_time_now ()
+ * tcp_main.tstamp_ticks_per_clock;
+ return tcp_main.time_now[thread_index];
+}
+
+always_inline void
+tcp_update_time (f64 now, u32 thread_index)
+{
+ tcp_set_time_now (thread_index);
+ tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index],
+ now);
+ tcp_flush_frames_to_output (thread_index);
+}
+
+u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b);
+
+u32
+tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
+ u32 max_bytes, vlib_buffer_t ** b);
+
+void tcp_connection_timers_init (tcp_connection_t * tc);
+void tcp_connection_timers_reset (tcp_connection_t * tc);
+void tcp_init_snd_vars (tcp_connection_t * tc);
+void tcp_connection_init_vars (tcp_connection_t * tc);
+
+always_inline void
+tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ /* Reset flags, make sure ack is sent */
+ tc->flags = TCP_CONN_SNDACK;
+ vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
+}
+
+always_inline void
+tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
+{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
+ ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID);
+ tc->timers[timer_id]
+ = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
+ tc->c_c_index, timer_id, interval);
+}
+
+always_inline void
+tcp_timer_reset (tcp_connection_t * tc, u8 timer_id)
+{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
+ if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID)
+ return;
+
+ tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
+ tc->timers[timer_id]);
+ tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID;
+}
+
+always_inline void
+tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval)
+{
+ ASSERT (tc->c_thread_index == vlib_get_thread_index ());
+ if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID)
+ tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
+ tc->timers[timer_id]);
+ tc->timers[timer_id] =
+ tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
+ tc->c_c_index, timer_id, interval);
+}
+
+always_inline void
+tcp_retransmit_timer_set (tcp_connection_t * tc)
+{
+ ASSERT (tc->snd_una != tc->snd_una_max);
+ tcp_timer_set (tc, TCP_TIMER_RETRANSMIT,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_retransmit_timer_reset (tcp_connection_t * tc)
+{
+ tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
+}
+
+always_inline void
+tcp_retransmit_timer_force_update (tcp_connection_t * tc)
+{
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_set (tcp_connection_t * tc)
+{
+ /* Reuse RTO. It's backed off in handler */
+ tcp_timer_set (tc, TCP_TIMER_PERSIST,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK,
+ TCP_TIMER_PERSIST_MIN));
+}
+
+always_inline void
+tcp_persist_timer_update (tcp_connection_t * tc)
+{
+ tcp_timer_update (tc, TCP_TIMER_PERSIST,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK,
+ TCP_TIMER_PERSIST_MIN));
+}
+
+always_inline void
+tcp_persist_timer_reset (tcp_connection_t * tc)
+{
+ tcp_timer_reset (tc, TCP_TIMER_PERSIST);
+}
+
+always_inline void
+tcp_retransmit_timer_update (tcp_connection_t * tc)
+{
+ if (tc->snd_una == tc->snd_una_max)
+ {
+ tcp_retransmit_timer_reset (tc);
+ if (tc->snd_wnd < tc->snd_mss)
+ tcp_persist_timer_update (tc);
+ }
+ else
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline u8
+tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
+{
+ return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID;
+}
+
+#define tcp_validate_txf_size(_tc, _a) \
+ ASSERT(_tc->state != TCP_STATE_ESTABLISHED \
+ || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a)
+
+void
+scoreboard_remove_hole (sack_scoreboard_t * sb,
+ sack_scoreboard_hole_t * hole);
+void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb);
+sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb,
+ u32 prev_index, u32 start,
+ u32 end);
+sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+ sack_scoreboard_hole_t *
+ start, u8 have_sent_1_smss,
+ u8 * can_rescue,
+ u8 * snd_limited);
+void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq);
+
+always_inline sack_scoreboard_hole_t *
+scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
+{
+ if (index != TCP_INVALID_SACK_HOLE_INDEX)
+ return pool_elt_at_index (sb->holes, index);
+ return 0;
+}
+
+always_inline sack_scoreboard_hole_t *
+scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
+ return pool_elt_at_index (sb->holes, hole->next);
+ return 0;
+}
+
+always_inline sack_scoreboard_hole_t *
+scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
+ return pool_elt_at_index (sb->holes, hole->prev);
+ return 0;
+}
+
+always_inline sack_scoreboard_hole_t *
+scoreboard_first_hole (sack_scoreboard_t * sb)
+{
+ if (sb->head != TCP_INVALID_SACK_HOLE_INDEX)
+ return pool_elt_at_index (sb->holes, sb->head);
+ return 0;
+}
+
+always_inline sack_scoreboard_hole_t *
+scoreboard_last_hole (sack_scoreboard_t * sb)
+{
+ if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX)
+ return pool_elt_at_index (sb->holes, sb->tail);
+ return 0;
+}
+
+always_inline void
+scoreboard_clear (sack_scoreboard_t * sb)
+{
+ sack_scoreboard_hole_t *hole;
+ while ((hole = scoreboard_first_hole (sb)))
+ {
+ scoreboard_remove_hole (sb, hole);
+ }
+ ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
+ ASSERT (pool_elts (sb->holes) == 0);
+ sb->sacked_bytes = 0;
+ sb->last_sacked_bytes = 0;
+ sb->last_bytes_delivered = 0;
+ sb->snd_una_adv = 0;
+ sb->high_sacked = 0;
+ sb->high_rxt = 0;
+ sb->lost_bytes = 0;
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+}
+
+always_inline u32
+scoreboard_hole_bytes (sack_scoreboard_hole_t * hole)
+{
+ return hole->end - hole->start;
+}
+
+always_inline u32
+scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
+ return hole - sb->holes;
+}
+
+always_inline void
+scoreboard_init (sack_scoreboard_t * sb)
+{
+ sb->head = TCP_INVALID_SACK_HOLE_INDEX;
+ sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+}
+
+void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack);
+
+always_inline void
+tcp_cc_algo_register (tcp_cc_algorithm_type_e type,
+ const tcp_cc_algorithm_t * vft)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vec_validate (tm->cc_algos, type);
+
+ tm->cc_algos[type] = *vft;
+}
+
+always_inline tcp_cc_algorithm_t *
+tcp_cc_algo_get (tcp_cc_algorithm_type_e type)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ return &tm->cc_algos[type];
+}
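+
+/* Registration sketch: a congestion control implementation fills in the
+ * vft and registers itself once at init (modeled on tcp_newreno.c; the
+ * newreno_* names are that implementation's, shown for illustration):
+ *
+ *   static const tcp_cc_algorithm_t newreno = {
+ *     .init = newreno_conn_init,
+ *     .rcv_ack = newreno_rcv_ack,
+ *     .rcv_cong_ack = newreno_rcv_cong_ack,
+ *     .congestion = newreno_congestion,
+ *     .recovered = newreno_recovered,
+ *   };
+ *   tcp_cc_algo_register (TCP_CC_NEWRENO, &newreno);
+ */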
+
+void tcp_cc_init (tcp_connection_t * tc);
+
+/**
+ * Push TCP header to buffer
+ *
+ * @param b - buffer to write the header to
+ * @param sp - source port, network order
+ * @param dp - destination port, network order
+ * @param seq - sequence number, network order
+ * @param ack - ack number, network order
+ * @param tcp_hdr_opts_len - header and options length in bytes
+ * @param flags - header flags
+ * @param wnd - window size, network order
+ *
+ * @return - pointer to start of TCP header
+ */
+always_inline void *
+vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq,
+ u32 ack, u8 tcp_hdr_opts_len, u8 flags,
+ u16 wnd)
+{
+ tcp_header_t *th;
+
+ th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len);
+
+ th->src_port = sp;
+ th->dst_port = dp;
+ th->seq_number = seq;
+ th->ack_number = ack;
+ th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
+ th->flags = flags;
+ th->window = wnd;
+ th->checksum = 0;
+ th->urgent_pointer = 0;
+ return th;
+}
+
+/**
+ * Push TCP header to buffer
+ *
+ * @param b - buffer to write the header to
+ * @param sp_net - source port net order
+ * @param dp_net - destination port net order
+ * @param seq - sequence number host order
+ * @param ack - ack number host order
+ * @param tcp_hdr_opts_len - header and options length in bytes
+ * @param flags - header flags
+ * @param wnd - window size
+ *
+ * @return - pointer to start of TCP header
+ */
+always_inline void *
+vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq,
+ u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
+{
+ return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net,
+ clib_host_to_net_u32 (seq),
+ clib_host_to_net_u32 (ack),
+ tcp_hdr_opts_len, flags,
+ clib_host_to_net_u16 (wnd));
+}
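+
+/* Usage sketch, modeled on the output path (option encoding omitted;
+ * the advertised window is scaled down by rcv_wscale):
+ *
+ *   u8 hdr_len = sizeof (tcp_header_t) + tc->snd_opts_len;
+ *   tcp_header_t *th = vlib_buffer_push_tcp (b, tc->c_lcl_port,
+ *                                            tc->c_rmt_port, tc->snd_nxt,
+ *                                            tc->rcv_nxt, hdr_len,
+ *                                            TCP_FLAG_ACK,
+ *                                            tc->rcv_wnd >> tc->rcv_wscale);
+ */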
+
+#endif /* _vnet_tcp_h_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_api.c b/src/vnet/tcp/tcp_api.c
new file mode 100644
index 00000000..4c3e49ee
--- /dev/null
+++ b/src/vnet/tcp/tcp_api.c
@@ -0,0 +1,119 @@
+/*
+ *------------------------------------------------------------------
+ * tcp_api.c - vnet tcp-layer apis
+ *
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+
+#include <vnet/tcp/tcp.h>
+
+#include <vnet/vnet_msg_enum.h>
+
+#define vl_typedefs /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_typedefs
+
+#define vl_endianfun /* define message structures */
+#include <vnet/vnet_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vnet/vnet_all_api_h.h>
+#undef vl_printfun
+
+#include <vlibapi/api_helper_macros.h>
+
+#define foreach_tcp_api_msg \
+_(TCP_CONFIGURE_SRC_ADDRESSES, tcp_configure_src_addresses)
+
+static void
+ vl_api_tcp_configure_src_addresses_t_handler
+ (vl_api_tcp_configure_src_addresses_t * mp)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vl_api_tcp_configure_src_addresses_reply_t *rmp;
+ u32 vrf_id;
+ int rv;
+
+ vrf_id = clib_net_to_host_u32 (mp->vrf_id);
+
+ if (mp->is_ipv6)
+ rv = tcp_configure_v6_source_address_range
+ (vm,
+ (ip6_address_t *) mp->first_address,
+ (ip6_address_t *) mp->last_address, vrf_id);
+ else
+ rv = tcp_configure_v4_source_address_range
+ (vm,
+ (ip4_address_t *) mp->first_address,
+ (ip4_address_t *) mp->last_address, vrf_id);
+
+ REPLY_MACRO (VL_API_TCP_CONFIGURE_SRC_ADDRESSES_REPLY);
+}
+
+#define vl_msg_name_crc_list
+#include <vnet/tcp/tcp.api.h>
+#undef vl_msg_name_crc_list
+
+static void
+setup_message_id_table (api_main_t * am)
+{
+#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id);
+ foreach_vl_msg_name_crc_tcp;
+#undef _
+}
+
+static clib_error_t *
+tcp_api_hookup (vlib_main_t * vm)
+{
+ api_main_t *am = &api_main;
+
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_endian, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 1);
+ foreach_tcp_api_msg;
+#undef _
+
+ /*
+ * Set up the (msg_name, crc, message-id) table
+ */
+ setup_message_id_table (am);
+
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION (tcp_api_hookup);
+
+void
+tcp_api_reference (void)
+{
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
new file mode 100755
index 00000000..eb318cde
--- /dev/null
+++ b/src/vnet/tcp/tcp_debug.h
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_VNET_TCP_TCP_DEBUG_H_
+#define SRC_VNET_TCP_TCP_DEBUG_H_
+
+#include <vlib/vlib.h>
+
+#define TCP_DEBUG (1)
+#define TCP_DEBUG_SM (0)
+#define TCP_DEBUG_CC (0)
+#define TCP_DEBUG_CC_STAT (1)
+
+#define foreach_tcp_dbg_evt \
+ _(INIT, "") \
+ _(DEALLOC, "") \
+ _(OPEN, "open") \
+ _(CLOSE, "close") \
+ _(BIND, "bind") \
+ _(UNBIND, "unbind") \
+ _(DELETE, "delete") \
+ _(SYN_SENT, "SYN sent") \
+ _(SYNACK_SENT, "SYNACK sent") \
+ _(SYNACK_RCVD, "SYNACK rcvd") \
+ _(SYN_RXT, "SYN retransmit") \
+ _(FIN_SENT, "FIN sent") \
+ _(ACK_SENT, "ACK sent") \
+ _(DUPACK_SENT, "DUPACK sent") \
+ _(RST_SENT, "RST sent") \
+ _(SYN_RCVD, "SYN rcvd") \
+ _(ACK_RCVD, "ACK rcvd") \
+ _(DUPACK_RCVD, "DUPACK rcvd") \
+ _(FIN_RCVD, "FIN rcvd") \
+ _(RST_RCVD, "RST rcvd") \
+ _(STATE_CHANGE, "state change") \
+ _(PKTIZE, "packetize") \
+ _(INPUT, "in") \
+ _(SND_WND, "snd_wnd update") \
+ _(OUTPUT, "output") \
+ _(TIMER_POP, "timer pop") \
+ _(CC_RTX, "retransmit") \
+ _(CC_EVT, "cc event") \
+ _(CC_PACK, "cc partial ack") \
+ _(CC_STAT, "cc stats") \
+ _(CC_RTO_STAT, "cc rto stats") \
+ _(SEG_INVALID, "invalid segment") \
+ _(PAWS_FAIL, "failed paws check") \
+ _(ACK_RCV_ERR, "invalid ack") \
+  _(RCV_WND_SHRUNK, "shrunk rcv_wnd")
+
+typedef enum _tcp_dbg
+{
+#define _(sym, str) TCP_DBG_##sym,
+ foreach_tcp_dbg_evt
+#undef _
+} tcp_dbg_e;
+
+typedef enum _tcp_dbg_evt
+{
+#define _(sym, str) TCP_EVT_##sym,
+ foreach_tcp_dbg_evt
+#undef _
+} tcp_dbg_evt_e;
+
+#if TCP_DEBUG
+
+#define TRANSPORT_DEBUG (1)
+
+/*
+ * Infra and evt track setup
+ */
+
+#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args)
+
+#define DECLARE_ETD(_tc, _e, _size) \
+ struct \
+ { \
+ u32 data[_size]; \
+ } * ed; \
+ ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \
+ _e, _tc->c_elog_track)
+
+#define TCP_DBG_IP_TAG_LCL(_tc) \
+{ \
+ if (_tc->c_is_ip4) \
+ { \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "lcl: %d.%d.%d.%d:%d", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->c_lcl_ip.ip4.as_u8[0]; \
+ ed->data[1] = _tc->c_lcl_ip.ip4.as_u8[1]; \
+ ed->data[2] = _tc->c_lcl_ip.ip4.as_u8[2]; \
+ ed->data[3] = _tc->c_lcl_ip.ip4.as_u8[3]; \
+ ed->data[4] = clib_net_to_host_u16(_tc->c_lcl_port); \
+ } \
+}
+
+#define TCP_DBG_IP_TAG_RMT(_tc) \
+{ \
+ if (_tc->c_is_ip4) \
+ { \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "rmt: %d.%d.%d.%d:%d", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->c_rmt_ip.ip4.as_u8[0]; \
+ ed->data[1] = _tc->c_rmt_ip.ip4.as_u8[1]; \
+ ed->data[2] = _tc->c_rmt_ip.ip4.as_u8[2]; \
+ ed->data[3] = _tc->c_rmt_ip.ip4.as_u8[3]; \
+ ed->data[4] = clib_net_to_host_u16(_tc->c_rmt_port); \
+ } \
+}
+
+#define TCP_EVT_INIT_HANDLER(_tc, _is_l, ...) \
+{ \
+ char *_fmt = _is_l ? "l[%d].%d:%d%c" : "[%d].%d:%d->.%d:%d%c"; \
+ if (_tc->c_is_ip4) \
+ { \
+ _tc->c_elog_track.name = \
+ (char *) format (0, _fmt, _tc->c_thread_index, \
+ _tc->c_lcl_ip.ip4.as_u8[3], \
+ clib_net_to_host_u16(_tc->c_lcl_port), \
+ _tc->c_rmt_ip.ip4.as_u8[3], \
+ clib_net_to_host_u16(_tc->c_rmt_port), 0); \
+ } \
+ else \
+ _tc->c_elog_track.name = \
+ (char *) format (0, _fmt, _tc->c_thread_index, \
+ _tc->c_lcl_ip.ip6.as_u8[15], \
+ clib_net_to_host_u16(_tc->c_lcl_port), \
+ _tc->c_rmt_ip.ip6.as_u8[15], \
+ clib_net_to_host_u16(_tc->c_rmt_port), 0); \
+ elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\
+ TCP_DBG_IP_TAG_LCL(_tc); \
+ TCP_DBG_IP_TAG_RMT(_tc); \
+}
+
+#define TCP_EVT_DEALLOC_HANDLER(_tc, ...) \
+{ \
+ vec_free (_tc->c_elog_track.name); \
+}
+
+#define TCP_EVT_OPEN_HANDLER(_tc, ...) \
+{ \
+ TCP_EVT_INIT_HANDLER(_tc, 0); \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "open: index %d", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->c_c_index; \
+}
+
+#define TCP_EVT_CLOSE_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "close: %d", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->c_c_index; \
+}
+
+#define TCP_EVT_BIND_HANDLER(_tc, ...) \
+{ \
+ TCP_EVT_INIT_HANDLER(_tc, 1); \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "bind: listener %d", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->c_c_index; \
+}
+
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \
+{ \
+ if (_init) \
+ TCP_EVT_INIT_HANDLER(_tc, 0); \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "syn-rx: irs %u", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->irs; \
+ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
+}
+
+#define TCP_EVT_UNBIND_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "unbind: listener %d", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->c_c_index; \
+ TCP_EVT_DEALLOC_HANDLER(_tc); \
+}
+
+#define TCP_EVT_DELETE_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "delete: %d", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->c_c_index; \
+ TCP_EVT_DEALLOC_HANDLER(_tc); \
+}
+
+#define CONCAT_HELPER(_a, _b) _a##_b
+#define CC(_a, _b) CONCAT_HELPER(_a, _b)
+#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
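+
+/* Example: TCP_EVT_DBG (TCP_EVT_OPEN, tc) pastes the event name onto
+ * _HANDLER, expanding to TCP_EVT_OPEN_HANDLER (tc); with TCP_DEBUG off,
+ * the #else branch below compiles every invocation away to nothing. */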
+#else
+#define TCP_EVT_DBG(_evt, _args...)
+#define TCP_DBG(_fmt, _args...)
+#endif
+
+/*
+ * State machine
+ */
+#if TCP_DEBUG_SM
+
+#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "state: %s", \
+ .format_args = "t4", \
+ .n_enum_strings = 11, \
+ .enum_strings = { \
+ "closed", \
+ "listen", \
+ "syn-sent", \
+ "syn-rcvd", \
+ "established", \
+ "close_wait", \
+ "fin-wait-1", \
+ "last-ack", \
+ "closing", \
+ "fin-wait-2", \
+ "time-wait", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->state; \
+}
+
+#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "syn-tx: iss %u snd_una %u snd_una_max %u snd_nxt %u", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _tc->iss; \
+ ed->data[1] = _tc->snd_una - _tc->iss; \
+ ed->data[2] = _tc->snd_una_max - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
+}
+
+#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->iss; \
+ ed->data[1] = _tc->irs; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+}
+
+#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->iss; \
+ ed->data[1] = _tc->irs; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
+}
+
+#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "fin-tx: snd_nxt %d rcv_nxt %d", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _tc->snd_nxt - _tc->iss; \
+ ed->data[1] = _tc->rcv_nxt - _tc->irs; \
+}
+
+#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "rst-tx: snd_nxt %d rcv_nxt %d", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _tc->snd_nxt - _tc->iss; \
+ ed->data[1] = _tc->rcv_nxt - _tc->irs; \
+ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
+}
+
+#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "fin-rx: snd_nxt %d rcv_nxt %d", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _tc->snd_nxt - _tc->iss; \
+ ed->data[1] = _tc->rcv_nxt - _tc->irs; \
+}
+
+#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "rst-rx: snd_nxt %d rcv_nxt %d", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _tc->snd_nxt - _tc->iss; \
+ ed->data[1] = _tc->rcv_nxt - _tc->irs; \
+}
+
+#define TCP_EVT_SYN_RXT_HANDLER(_tc, _type, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \
+ .format_args = "t4i4i4i4i4", \
+ .n_enum_strings = 2, \
+ .enum_strings = { \
+ "syn", \
+ "syn-ack", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _type; \
+ ed->data[1] = _tc->iss; \
+ ed->data[2] = _tc->irs; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+}
+
+#else
+#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_SYN_RXT_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_RST_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...)
+#endif
+
+#if TCP_DEBUG_SM > 1
+
+#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "ack-tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \
+ ed->data[1] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[2] = _tc->rcv_wnd; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->snd_wnd; \
+}
+
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "ack-rx: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->bytes_acked; \
+ ed->data[1] = _tc->snd_una - _tc->iss; \
+ ed->data[2] = _tc->snd_wnd; \
+ ed->data[3] = _tc->cwnd; \
+ ed->data[4] = tcp_flight_size(_tc); \
+}
+
+#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "tx: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->snd_una - _tc->iss; \
+ ed->data[1] = _tc->snd_nxt - _tc->iss; \
+ ed->data[2] = tcp_available_output_snd_space (_tc); \
+ ed->data[3] = tcp_flight_size (_tc); \
+ ed->data[4] = _tc->rcv_wnd; \
+}
+
+#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d", \
+ .format_args = "t4i4i4i4i4", \
+ .n_enum_strings = 2, \
+ .enum_strings = { \
+ "order", \
+ "ooo", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _type; \
+ ed->data[1] = _len; \
+ ed->data[2] = _written; \
+ ed->data[3] = (_tc->rcv_nxt - _tc->irs) + _written; \
+ ed->data[4] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \
+}
+
+#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) \
+{ \
+ tcp_connection_t *_tc; \
+ if (_timer_id == TCP_TIMER_RETRANSMIT_SYN \
+ || _timer_id == TCP_TIMER_ESTABLISH) \
+ { \
+ _tc = tcp_half_open_connection_get (_tc_index); \
+ } \
+ else \
+ { \
+ u32 _thread_index = vlib_get_thread_index (); \
+ _tc = tcp_connection_get (_tc_index, _thread_index); \
+ } \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "timer-pop: %s (%d)", \
+ .format_args = "t4i4", \
+ .n_enum_strings = 7, \
+ .enum_strings = { \
+ "retransmit", \
+ "delack", \
+ "persist", \
+ "keep", \
+ "waitclose", \
+ "retransmit syn", \
+ "establish", \
+ }, \
+ }; \
+ if (_tc) \
+ { \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _timer_id; \
+ ed->data[1] = _timer_id; \
+ } \
+ else \
+ { \
+ clib_warning ("pop %d for unexisting connection %d", _timer_id, \
+ _tc_index); \
+ } \
+}
+
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _seq - _tc->irs; \
+ ed->data[1] = _end - _tc->irs; \
+ ed->data[2] = _tc->rcv_las - _tc->irs; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_wnd; \
+}
+
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _seq - _tc->irs; \
+ ed->data[1] = _end - _tc->irs; \
+ ed->data[2] = _tc->rcv_opts.tsval; \
+ ed->data[3] = _tc->tsval_recent; \
+}
+
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \
+ .format_args = "t4i4i4i4i4", \
+ .n_enum_strings = 3, \
+ .enum_strings = { \
+ "invalid", \
+ "old", \
+ "future", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _type; \
+ ed->data[1] = _ack - _tc->iss; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->snd_una_max - _tc->iss; \
+}
+
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
+{ \
+if (_av > 0) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_wnd; \
+ ed->data[1] = _obs; \
+ ed->data[2] = _av; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_las - _tc->irs; \
+} \
+}
+#else
+#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_PKTIZE_HANDLER(_tc, ...)
+#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...)
+#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...)
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
+#endif
+
+/*
+ * State machine verbose
+ */
+#if TCP_DEBUG_SM > 2
+#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "snd-wnd update: %u ", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->snd_wnd; \
+}
+
+#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "out: flags %x, bytes %u", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = flags; \
+ ed->data[1] = n_bytes; \
+}
+#else
+#define TCP_EVT_SND_WND_HANDLER(_tc, ...)
+#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...)
+#endif
+
+/*
+ * Congestion Control
+ */
+
+#if TCP_DEBUG_CC
+
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \
+ .format_args = "t4i4i4i4", \
+ .n_enum_strings = 6, \
+ .enum_strings = { \
+ "fast-rxt", \
+ "rxt-timeout", \
+ "first-rxt", \
+ "recovered", \
+ "congestion", \
+ "undo", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _sub_evt; \
+ ed->data[1] = tcp_available_snd_space (_tc); \
+ ed->data[2] = _tc->snd_congestion - _tc->iss; \
+ ed->data[3] = _tc->snd_rxt_bytes; \
+}
+
+#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _tc->snd_nxt - _tc->iss; \
+ ed->data[1] = offset; \
+ ed->data[2] = n_bytes; \
+ ed->data[3] = _tc->snd_rxt_bytes; \
+}
+
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[1] = _tc->rcv_wnd; \
+ ed->data[2] = _tc->snd_nxt - _tc->iss; \
+ ed->data[3] = tcp_available_snd_wnd(_tc); \
+ ed->data[4] = _tc->snd_wnd; \
+}
+
+#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->snd_una - _tc->iss; \
+ ed->data[1] = _tc->cwnd; \
+ ed->data[2] = _tc->snd_wnd; \
+ ed->data[3] = tcp_flight_size(_tc); \
+ ed->data[4] = _tc->rcv_wnd; \
+}
+
+#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "pack: snd_una %u snd_una_max %u", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = _tc->snd_una - _tc->iss; \
+ ed->data[1] = _tc->snd_una_max - _tc->iss; \
+}
+#else
+#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)
+#define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#endif
+
+/*
+ * Congestion control stats
+ */
+#if TCP_DEBUG_CC_STAT
+
+#define STATS_INTERVAL 1
+
+#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \
+{ \
+if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "rto_stat: rto %u srtt %u rttvar %u ", \
+ .format_args = "i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 3); \
+ ed->data[0] = _tc->rto; \
+ ed->data[1] = _tc->srtt; \
+ ed->data[2] = _tc->rttvar; \
+} \
+}
+
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \
+{ \
+if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "cc_stat: cwnd %u flight %u space %u ssthresh %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->cwnd; \
+ ed->data[1] = tcp_flight_size (_tc); \
+ ed->data[2] = tcp_snd_space (_tc); \
+ ed->data[3] = _tc->ssthresh; \
+ ed->data[4] = _tc->snd_wnd; \
+ TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \
+ _tc->c_cc_stat_tstamp = tcp_time_now(); \
+} \
+}
+
+#else
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)
+#endif
+
+#endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def
new file mode 100644
index 00000000..a179717f
--- /dev/null
+++ b/src/vnet/tcp/tcp_error.def
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+tcp_error (NONE, "no error")
+tcp_error (LENGTH, "inconsistent ip/tcp lengths")
+tcp_error (NO_LISTENER, "no listener for dst port")
+tcp_error (LOOKUP_DROPS, "lookup drops")
+tcp_error (DISPATCH, "Dispatch error")
+tcp_error (ENQUEUED, "Packets pushed into rx fifo")
+tcp_error (PARTIALLY_ENQUEUED, "Packets partially pushed into rx fifo")
+tcp_error (PURE_ACK, "Pure acks")
+tcp_error (SYNS_RCVD, "SYNs received")
+tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received")
+tcp_error (NOT_READY, "Session not ready for packets")
+tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space")
+tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space")
+tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space")
+tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated")
+tcp_error (SEGMENT_INVALID, "Invalid segments")
+tcp_error (SEGMENT_OLD, "Old segment")
+tcp_error (ACK_INVALID, "Invalid ACK")
+tcp_error (ACK_DUP, "Duplicate ACK")
+tcp_error (ACK_OLD, "Old ACK")
+tcp_error (ACK_FUTURE, "Future ACK")
+tcp_error (PKTS_SENT, "Packets sent")
+tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs")
+tcp_error (RST_SENT, "Resets sent")
+tcp_error (INVALID_CONNECTION, "Invalid connection")
+tcp_error (NO_WND, "No window")
+tcp_error (CONNECTION_CLOSED, "Connection closed")
+tcp_error (CREATE_EXISTS, "Connection already exists")
+tcp_error (PUNT, "Packets punted")
\ No newline at end of file
diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c
new file mode 100644
index 00000000..1ca2f58e
--- /dev/null
+++ b/src/vnet/tcp/tcp_format.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * tcp/tcp_format.c: tcp formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/tcp/tcp.h>
+
+u8 *
+format_tcp_flags (u8 * s, va_list * args)
+{
+ int flags = va_arg (*args, int);
+
+ s = format (s, "0x%02x", flags);
+#define _(f) if (flags & TCP_FLAG_##f) s = format (s, " %s", #f);
+ foreach_tcp_flag
+#undef _
+ return s;
+}
+
+/* Format TCP header. */
+u8 *
+format_tcp_header (u8 * s, va_list * args)
+{
+ tcp_header_t *tcp = va_arg (*args, tcp_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 header_bytes;
+ uword indent;
+
+ /* Nothing to do. */
+ if (max_header_bytes < sizeof (tcp[0]))
+ return format (s, "TCP header truncated");
+
+ indent = format_get_indent (s);
+ indent += 2;
+ header_bytes = tcp_header_bytes (tcp);
+
+ s = format (s, "TCP: %d -> %d", clib_net_to_host_u16 (tcp->src),
+ clib_net_to_host_u16 (tcp->dst));
+
+ s = format (s, "\n%Useq. 0x%08x ack 0x%08x", format_white_space, indent,
+ clib_net_to_host_u32 (tcp->seq_number),
+ clib_net_to_host_u32 (tcp->ack_number));
+
+ s = format (s, "\n%Uflags %U, tcp header: %d bytes", format_white_space,
+ indent, format_tcp_flags, tcp->flags, header_bytes);
+
+ s = format (s, "\n%Uwindow %d, checksum 0x%04x", format_white_space, indent,
+ clib_net_to_host_u16 (tcp->window),
+ clib_net_to_host_u16 (tcp->checksum));
+
+
+#if 0
+ /* Format TCP options. */
+ {
+ u8 *o;
+ u8 *option_start = (void *) (tcp + 1);
+ u8 *option_end = (void *) tcp + header_bytes;
+
+ for (o = option_start; o < option_end;)
+ {
+ u32 length = o[1];
+ switch (o[0])
+ {
+ case TCP_OPTION_END:
+ length = 1;
+ o = option_end;
+ break;
+
+ case TCP_OPTION_NOOP:
+ length = 1;
+ break;
+
+ }
+ }
+ }
+#endif
+
+ /* Recurse into next protocol layer. */
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ip_main_t *im = &ip_main;
+ tcp_udp_port_info_t *pi;
+
+ pi = ip_get_tcp_udp_port_info (im, tcp->dst);
+
+ if (pi && pi->format_header)
+ s = format (s, "\n%U%U", format_white_space, indent - 2,
+ pi->format_header,
+ /* next protocol header */ (void *) tcp + header_bytes,
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
new file mode 100644
index 00000000..63d6fd87
--- /dev/null
+++ b/src/vnet/tcp/tcp_input.c
@@ -0,0 +1,3215 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/sparse_vec.h>
+#include <vnet/tcp/tcp_packet.h>
+#include <vnet/tcp/tcp.h>
+#include <vnet/session/session.h>
+#include <math.h>
+
+static char *tcp_error_strings[] = {
+#define tcp_error(n,s) s,
+#include <vnet/tcp/tcp_error.def>
+#undef tcp_error
+};
+
+/* All TCP nodes have the same outgoing arcs */
+#define foreach_tcp_state_next \
+ _ (DROP, "error-drop") \
+ _ (TCP4_OUTPUT, "tcp4-output") \
+ _ (TCP6_OUTPUT, "tcp6-output")
+
+typedef enum _tcp_established_next
+{
+#define _(s,n) TCP_ESTABLISHED_NEXT_##s,
+ foreach_tcp_state_next
+#undef _
+ TCP_ESTABLISHED_N_NEXT,
+} tcp_established_next_t;
+
+typedef enum _tcp_rcv_process_next
+{
+#define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
+ foreach_tcp_state_next
+#undef _
+ TCP_RCV_PROCESS_N_NEXT,
+} tcp_rcv_process_next_t;
+
+typedef enum _tcp_syn_sent_next
+{
+#define _(s,n) TCP_SYN_SENT_NEXT_##s,
+ foreach_tcp_state_next
+#undef _
+ TCP_SYN_SENT_N_NEXT,
+} tcp_syn_sent_next_t;
+
+typedef enum _tcp_listen_next
+{
+#define _(s,n) TCP_LISTEN_NEXT_##s,
+ foreach_tcp_state_next
+#undef _
+ TCP_LISTEN_N_NEXT,
+} tcp_listen_next_t;
+
+/* Generic, state independent indices */
+typedef enum _tcp_state_next
+{
+#define _(s,n) TCP_NEXT_##s,
+ foreach_tcp_state_next
+#undef _
+ TCP_STATE_N_NEXT,
+} tcp_state_next_t;
+
+#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
+ : TCP_NEXT_TCP6_OUTPUT)
+
+vlib_node_registration_t tcp4_established_node;
+vlib_node_registration_t tcp6_established_node;
+
+/**
+ * Validate segment sequence number. As per RFC793:
+ *
+ * Segment Receive Test
+ * Length Window
+ * ------- ------- -------------------------------------------
+ * 0 0 SEG.SEQ = RCV.NXT
+ * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+ * >0 0 not acceptable
+ * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+ * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
+ *
+ * This ultimately amounts to checking whether the segment falls within the
+ * window. The one important difference compared to RFC793 is that we use
+ * rcv_las, i.e., the value rcv_nxt had when the last ack was sent, instead
+ * of rcv_nxt, since that is the peer's reference when computing our
+ * receive window.
+ *
+ * This:
+ * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
+ * however, is too strict when we have retransmits. Instead we just check that
+ * the seq is not beyond the right edge and that the end of the segment is not
+ * less than the left edge.
+ *
+ * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
+ * use rcv_nxt in the right edge window test instead of rcv_las.
+ *
+ */
+always_inline u8
+tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq)
+{
+ return (seq_geq (end_seq, tc->rcv_las)
+ && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
+}
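+
+/* Worked example, with illustrative values: rcv_las = 1000, rcv_nxt = 1500,
+ * rcv_wnd = 1000, so the right edge is rcv_nxt + rcv_wnd = 2500. A
+ * retransmitted segment [800, 1200) is accepted, since its end (1200) is not
+ * below the left edge (1000) and its start (800) is not beyond 2500. A stale
+ * segment [200, 900) and one starting at 2600 are both rejected. seq_geq and
+ * seq_leq compare modulo 2^32, so the test also holds across sequence number
+ * wrap. */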
+
+/**
+ * Parse TCP header options.
+ *
+ * @param th TCP header
+ * @param to TCP options data structure to be populated
+ * @return -1 if parsing failed
+ */
+int
+tcp_options_parse (tcp_header_t * th, tcp_options_t * to)
+{
+ const u8 *data;
+ u8 opt_len, opts_len, kind;
+ int j;
+ sack_block_t b;
+
+ opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
+ data = (const u8 *) (th + 1);
+
+ /* Zero out all flags but those set in SYN */
+ to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE);
+
+ for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
+ {
+ kind = data[0];
+
+ /* Get options length */
+ if (kind == TCP_OPTION_EOL)
+ break;
+ else if (kind == TCP_OPTION_NOOP)
+ {
+ opt_len = 1;
+ continue;
+ }
+ else
+ {
+ /* broken options */
+ if (opts_len < 2)
+ return -1;
+ opt_len = data[1];
+
+ /* weird option length */
+ if (opt_len < 2 || opt_len > opts_len)
+ return -1;
+ }
+
+ /* Parse options */
+ switch (kind)
+ {
+ case TCP_OPTION_MSS:
+ if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
+ {
+ to->flags |= TCP_OPTS_FLAG_MSS;
+ to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
+ }
+ break;
+ case TCP_OPTION_WINDOW_SCALE:
+ if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
+ {
+ to->flags |= TCP_OPTS_FLAG_WSCALE;
+ to->wscale = data[2];
+ if (to->wscale > TCP_MAX_WND_SCALE)
+ {
+ clib_warning ("Illegal window scaling value: %d",
+ to->wscale);
+ to->wscale = TCP_MAX_WND_SCALE;
+ }
+ }
+ break;
+ case TCP_OPTION_TIMESTAMP:
+ if (opt_len == TCP_OPTION_LEN_TIMESTAMP)
+ {
+ to->flags |= TCP_OPTS_FLAG_TSTAMP;
+ to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
+ to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
+ }
+ break;
+ case TCP_OPTION_SACK_PERMITTED:
+ if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
+ to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
+ break;
+ case TCP_OPTION_SACK_BLOCK:
+ /* If SACK permitted was not advertised or a SYN, break */
+ if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
+ break;
+
+ /* If too short or not correctly formatted, break */
+ if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
+ break;
+
+ to->flags |= TCP_OPTS_FLAG_SACK;
+ to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
+ vec_reset_length (to->sacks);
+ for (j = 0; j < to->n_sack_blocks; j++)
+ {
+ b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
+ b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
+ vec_add1 (to->sacks, b);
+ }
+ break;
+ default:
+ /* Nothing to see here */
+ continue;
+ }
+ }
+ return 0;
+}
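+
+/* For reference, the on-wire layout of a SACK option carrying two blocks, as
+ * consumed by the parser above (all fields in network byte order):
+ *
+ *   kind = 5, length = 18, then per block a 4-byte start and a 4-byte end:
+ *   05 12 <start0:4> <end0:4> <start1:4> <end1:4>
+ *
+ * The sanity check (opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK thus accepts
+ * lengths 10, 18, 26 and 34, i.e., 1 to 4 blocks within the 40-byte option
+ * space. */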
+
+/**
+ * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
+ * timestamp to echo and it's less than tsval_recent, drop segment
+ * but still send an ACK in order to retain TCP's mechanism for detecting
+ * and recovering from half-open connections
+ *
+ * Or at least that's what the theory says. It seems that this might not work
+ * very well with packet reordering and fast retransmit. XXX
+ */
+always_inline int
+tcp_segment_check_paws (tcp_connection_t * tc)
+{
+ return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
+ && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
+}
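+
+/* E.g., if tsval_recent is 5000 and a segment arrives echoing tsval 4990,
+ * the check above fails it (PAWS drop plus ack, see tcp_segment_validate
+ * below); a segment with tsval 5010 passes and may subsequently advance
+ * tsval_recent. */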
+
+/**
+ * Update tsval recent
+ */
+always_inline void
+tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
+{
+ /*
+ * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
+ * of an incoming segment:
+ * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
+ * then the TSval from the segment is copied to TS.Recent;
+ * otherwise, the TSval is ignored.
+ */
+ if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
+ && seq_leq (tc->rcv_las, seq_end))
+ {
+ ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
+ tc->tsval_recent = tc->rcv_opts.tsval;
+ tc->tsval_recent_age = tcp_time_now ();
+ }
+}
+
+/**
+ * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
+ *
+ * It first verifies if segment has a wrapped sequence number (PAWS) and then
+ * does the processing associated to the first four steps (ignoring security
+ * and precedence): sequence number, rst bit and syn bit checks.
+ *
+ * @return 0 if segments passes validation.
+ */
+static int
+tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
+ vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0)
+{
+ if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
+ return -1;
+
+ if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts)))
+ {
+ clib_warning ("options parse error");
+ return -1;
+ }
+
+ if (tcp_segment_check_paws (tc0))
+ {
+ if (CLIB_DEBUG > 2)
+ {
+ clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
+ clib_warning ("seq %u seq_end %u ack %u",
+ vnet_buffer (b0)->tcp.seq_number - tc0->irs,
+ vnet_buffer (b0)->tcp.seq_end - tc0->irs,
+ vnet_buffer (b0)->tcp.ack_number - tc0->iss);
+ }
+ TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
+ vnet_buffer (b0)->tcp.seq_end);
+
+ /* If tsval_recent was last updated more than 24 days ago (TCP_PAWS_IDLE),
+ * it can no longer be trusted, so invalidate it. */
+ if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
+ tcp_time_now ()))
+ {
+ /* Age isn't reset until we get a valid tsval (bsd inspired) */
+ tc0->tsval_recent = 0;
+ clib_warning ("paws failed - really old segment. REALLY?");
+ }
+ else
+ {
+ /* Drop after ack if not rst */
+ if (!tcp_rst (th0))
+ {
+ tcp_make_ack (tc0, b0);
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+ return -1;
+ }
+ }
+ }
+
+ /* 1st: check sequence number */
+ if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
+ vnet_buffer (b0)->tcp.seq_end))
+ {
+ /* If our window is 0 and the packet is in sequence, let it pass
+ * through for ack processing. It should be dropped later.*/
+ if (tc0->rcv_wnd == 0
+ && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
+ {
+ /* TODO Should segment be tagged? */
+ }
+ else
+ {
+ /* If not RST, send dup ack */
+ if (!tcp_rst (th0))
+ {
+ tcp_make_ack (tc0, b0);
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+ }
+ return -1;
+ }
+ }
+
+ /* 2nd: check the RST bit */
+ if (tcp_rst (th0))
+ {
+ tcp_connection_reset (tc0);
+ return -1;
+ }
+
+ /* 3rd: check security and precedence (skip) */
+
+ /* 4th: check the SYN bit */
+ if (tcp_syn (th0))
+ {
+ /* TODO implement RFC 5961 */
+ if (tc0->state == TCP_STATE_SYN_RCVD)
+ {
+ tcp_make_synack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
+ }
+ else
+ {
+ tcp_make_ack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
+ }
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ return -1;
+ }
+
+ /* If segment in window, save timestamp */
+ tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
+ vnet_buffer (b0)->tcp.seq_end);
+ return 0;
+}
+
+always_inline int
+tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
+{
+ /* SND.UNA =< SEG.ACK =< SND.NXT */
+ return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
+ && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt));
+}
+
+/**
+ * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
+ *
+ * Note that although in the original article srtt and rttvar are scaled
+ * to minimize round-off errors, here we don't scale. Instead, we rely on
+ * higher precision time measurements.
+ *
+ * TODO support microsecond (us) rtt resolution
+ */
+static void
+tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
+{
+ int err, diff;
+
+ if (tc->srtt != 0)
+ {
+ err = mrtt - tc->srtt;
+
+ /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
+ * The increase should be bound */
+ tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
+ diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
+ tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
+ }
+ else
+ {
+ /* First measurement. */
+ tc->srtt = mrtt;
+ tc->rttvar = mrtt >> 1;
+ }
+}
+
+void
+tcp_update_rto (tcp_connection_t * tc)
+{
+ tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
+}
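+
+/* Worked example of the two updates above, in clock ticks (values
+ * illustrative): with srtt = 100, rttvar = 20 and a new measurement
+ * mrtt = 140, err = 40, so srtt becomes 100 + (40 >> 3) = 105 and rttvar
+ * becomes 20 + ((40 - 20) >> 2) = 25. The rto is then
+ * min (105 + (25 << 2), TCP_RTO_MAX) = 205, subject to the TCP_RTO_MIN
+ * clamp. On a first measurement srtt = mrtt and rttvar = mrtt / 2, which
+ * yields the RFC6298 initial rto of 3 * mrtt. */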
+
+/**
+ * Update RTT estimate and RTO timer
+ *
+ * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
+ * timing. Middle boxes are known to fiddle with TCP options so we
+ * should give higher priority to ACK timing.
+ *
+ * This should be called only if previously sent bytes have been acked.
+ *
+ * return 1 if a valid rtt was measured, 0 otherwise
+ */
+static int
+tcp_update_rtt (tcp_connection_t * tc, u32 ack)
+{
+ u32 mrtt = 0;
+
+ /* Karn's rule, part 1. Don't use retransmitted segments to estimate
+ * RTT because they're ambiguous. */
+ if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes)
+ goto done;
+
+ if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
+ {
+ mrtt = tcp_time_now () - tc->rtt_ts;
+ }
+ /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
+ * snd_una, i.e., the left side of the send window:
+ * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
+ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
+ {
+ mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
+ }
+
+ /* Ignore dubious measurements */
+ if (mrtt == 0 || mrtt > TCP_RTT_MAX)
+ goto done;
+
+ tcp_estimate_rtt (tc, mrtt);
+
+done:
+
+ /* Allow measuring of a new RTT */
+ tc->rtt_ts = 0;
+
+ /* If we got here something must've been ACKed so make sure boff is 0,
+ * even if mrtt is not valid, since we update the rto below */
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+
+ return 0;
+}
+
+/**
+ * Dequeue bytes that have been acked and while at it update RTT estimates.
+ */
+static void
+tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
+{
+ /* Dequeue the newly ACKed and SACKed bytes */
+ stream_session_dequeue_drop (&tc->connection,
+ tc->bytes_acked + tc->sack_sb.snd_una_adv);
+
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+
+ /* Update rtt and rto */
+ tcp_update_rtt (tc, ack);
+
+ /* If everything has been acked, stop retransmit timer
+ * otherwise update. */
+ tcp_retransmit_timer_update (tc);
+}
+
+/**
+ * Check if duplicate ack as per RFC5681 Sec. 2
+ */
+static u8
+tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
+ u32 prev_snd_una)
+{
+ return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
+ && seq_gt (tc->snd_una_max, tc->snd_una)
+ && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
+ && (prev_snd_wnd == tc->snd_wnd));
+}
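+
+/* E.g., with 3000 bytes outstanding (snd_una = 1000, snd_una_max = 4000),
+ * successive acks for 1000 that carry no payload and leave the advertised
+ * window unchanged count as dupacks; once TCP_DUPACK_THRESHOLD of them
+ * accumulate (3, per RFC5681), fast retransmit is entered in
+ * tcp_cc_handle_event below. */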
+
+/**
+ * Checks if ack is a congestion control event.
+ */
+static u8
+tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
+ u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
+{
+ /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
+ * defined to be 'duplicate' */
+ *is_dack = tc->sack_sb.last_sacked_bytes
+ || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
+
+ return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc));
+}
+
+void
+scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ sack_scoreboard_hole_t *next, *prev;
+
+ if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ next = pool_elt_at_index (sb->holes, hole->next);
+ next->prev = hole->prev;
+ }
+ else
+ {
+ sb->tail = hole->prev;
+ }
+
+ if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ prev = pool_elt_at_index (sb->holes, hole->prev);
+ prev->next = hole->next;
+ }
+ else
+ {
+ sb->head = hole->next;
+ }
+
+ if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (hole, 0xfe, sizeof (*hole));
+
+ pool_put (sb->holes, hole);
+}
+
+sack_scoreboard_hole_t *
+scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
+ u32 start, u32 end)
+{
+ sack_scoreboard_hole_t *hole, *next, *prev;
+ u32 hole_index;
+
+ pool_get (sb->holes, hole);
+ memset (hole, 0, sizeof (*hole));
+
+ hole->start = start;
+ hole->end = end;
+ hole_index = scoreboard_hole_index (sb, hole);
+
+ prev = scoreboard_get_hole (sb, prev_index);
+ if (prev)
+ {
+ hole->prev = prev_index;
+ hole->next = prev->next;
+
+ if ((next = scoreboard_next_hole (sb, hole)))
+ next->prev = hole_index;
+ else
+ sb->tail = hole_index;
+
+ prev->next = hole_index;
+ }
+ else
+ {
+ sb->head = hole_index;
+ hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
+ hole->next = TCP_INVALID_SACK_HOLE_INDEX;
+ }
+
+ return hole;
+}
+
+void
+scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb)
+{
+ sack_scoreboard_hole_t *hole, *prev;
+ u32 bytes = 0, blks = 0;
+
+ sb->lost_bytes = 0;
+ sb->sacked_bytes = 0;
+ hole = scoreboard_last_hole (sb);
+ if (!hole)
+ return;
+
+ if (seq_gt (sb->high_sacked, hole->end))
+ {
+ bytes = sb->high_sacked - hole->end;
+ blks = 1;
+ }
+
+ while ((prev = scoreboard_prev_hole (sb, hole))
+ && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
+ && blks < TCP_DUPACK_THRESHOLD))
+ {
+ bytes += hole->start - prev->end;
+ blks++;
+ hole = prev;
+ }
+
+ while (hole)
+ {
+ sb->lost_bytes += scoreboard_hole_bytes (hole);
+ hole->is_lost = 1;
+ prev = hole;
+ hole = scoreboard_prev_hole (sb, hole);
+ if (hole)
+ bytes += prev->start - hole->end;
+ }
+ sb->sacked_bytes = bytes;
+}
+
+/**
+ * Figure out the next hole to retransmit
+ *
+ * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
+ */
+sack_scoreboard_hole_t *
+scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+ sack_scoreboard_hole_t * start,
+ u8 have_sent_1_smss,
+ u8 * can_rescue, u8 * snd_limited)
+{
+ sack_scoreboard_hole_t *hole = 0;
+
+ hole = start ? start : scoreboard_first_hole (sb);
+ while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
+ hole = scoreboard_next_hole (sb, hole);
+
+ /* Nothing, return */
+ if (!hole)
+ {
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
+ }
+
+ /* Rule (1): if higher than rxt, less than high_sacked and lost */
+ if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
+ {
+ sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+ }
+ else
+ {
+ /* Rule (2): output takes care of transmitting new data */
+ if (!have_sent_1_smss)
+ {
+ hole = 0;
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ }
+ /* Rule (3): if hole not lost */
+ else if (seq_lt (hole->start, sb->high_sacked))
+ {
+ *snd_limited = 1;
+ sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+ }
+ /* Rule (4): if hole beyond high_sacked */
+ else
+ {
+ ASSERT (seq_geq (hole->start, sb->high_sacked));
+ *snd_limited = 1;
+ *can_rescue = 1;
+ /* HighRxt MUST NOT be updated */
+ return 0;
+ }
+ }
+
+ if (hole && seq_lt (sb->high_rxt, hole->start))
+ sb->high_rxt = hole->start;
+
+ return hole;
+}
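+
+/* Illustrative walk: with holes [1000, 2000) marked lost and [3000, 4000)
+ * not lost, high_sacked = 4500 and high_rxt = 1000, the first call returns
+ * [1000, 2000) under rule (1). Once high_rxt has advanced past 2000,
+ * [3000, 4000) is returned under rule (3) with *snd_limited set, provided
+ * at least one SMSS of new data was already sent; otherwise rule (2)
+ * defers to sending new data. A hole at or beyond high_sacked would
+ * instead take the rescue path of rule (4). */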
+
+void
+scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq)
+{
+ sack_scoreboard_hole_t *hole;
+ hole = scoreboard_first_hole (sb);
+ if (hole)
+ {
+ seq = seq_gt (seq, hole->start) ? seq : hole->start;
+ sb->cur_rxt_hole = sb->head;
+ }
+ sb->high_rxt = seq;
+}
+
+/**
+ * Test that scoreboard is sane after recovery
+ *
+ * Returns 1 if scoreboard is empty or if first hole beyond
+ * snd_una.
+ */
+u8
+tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc)
+{
+ sack_scoreboard_hole_t *hole;
+ hole = scoreboard_first_hole (&tc->sack_sb);
+ return (!hole || seq_geq (hole->start, tc->snd_una));
+}
+
+void
+tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
+{
+ sack_scoreboard_t *sb = &tc->sack_sb;
+ sack_block_t *blk, tmp;
+ sack_scoreboard_hole_t *hole, *next_hole, *last_hole;
+ u32 blk_index = 0, old_sacked_bytes, hole_index;
+ int i, j;
+
+ sb->last_sacked_bytes = 0;
+ sb->snd_una_adv = 0;
+ old_sacked_bytes = sb->sacked_bytes;
+ sb->last_bytes_delivered = 0;
+
+ if (!tcp_opts_sack (&tc->rcv_opts)
+ && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
+ return;
+
+ /* Remove invalid blocks */
+ blk = tc->rcv_opts.sacks;
+ while (blk < vec_end (tc->rcv_opts.sacks))
+ {
+ if (seq_lt (blk->start, blk->end)
+ && seq_gt (blk->start, tc->snd_una)
+ && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_una_max))
+ {
+ blk++;
+ continue;
+ }
+ vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
+ }
+
+ /* Add block for cumulative ack */
+ if (seq_gt (ack, tc->snd_una))
+ {
+ tmp.start = tc->snd_una;
+ tmp.end = ack;
+ vec_add1 (tc->rcv_opts.sacks, tmp);
+ }
+
+ if (vec_len (tc->rcv_opts.sacks) == 0)
+ return;
+
+ tcp_scoreboard_trace_add (tc, ack);
+
+ /* Make sure blocks are ordered */
+ for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++)
+ for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++)
+ if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start))
+ {
+ tmp = tc->rcv_opts.sacks[i];
+ tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j];
+ tc->rcv_opts.sacks[j] = tmp;
+ }
+
+ if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ /* If no holes, insert the first that covers all outstanding bytes */
+ last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
+ tc->snd_una, tc->snd_una_max);
+ sb->tail = scoreboard_hole_index (sb, last_hole);
+ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
+ sb->high_sacked = tmp.end;
+ }
+ else
+ {
+ /* If we have holes but snd_una_max is beyond the last hole, update
+ * last hole end */
+ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
+ last_hole = scoreboard_last_hole (sb);
+ if (seq_gt (tc->snd_una_max, last_hole->end))
+ {
+ if (seq_geq (last_hole->start, sb->high_sacked))
+ {
+ last_hole->end = tc->snd_una_max;
+ }
+ /* New hole after high sacked block */
+ else if (seq_lt (sb->high_sacked, tc->snd_una_max))
+ {
+ scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
+ tc->snd_una_max);
+ }
+ }
+ /* Keep track of max byte sacked for when the last hole
+ * is acked */
+ if (seq_gt (tmp.end, sb->high_sacked))
+ sb->high_sacked = tmp.end;
+ }
+
+ /* Walk the holes with the SACK blocks */
+ hole = pool_elt_at_index (sb->holes, sb->head);
+ while (hole && blk_index < vec_len (tc->rcv_opts.sacks))
+ {
+ blk = &tc->rcv_opts.sacks[blk_index];
+ if (seq_leq (blk->start, hole->start))
+ {
+ /* Block covers hole. Remove hole */
+ if (seq_geq (blk->end, hole->end))
+ {
+ next_hole = scoreboard_next_hole (sb, hole);
+
+ /* Byte accounting: snd_una needs to be advanced */
+ if (blk->end == ack)
+ {
+ if (next_hole)
+ {
+ if (seq_lt (ack, next_hole->start))
+ sb->snd_una_adv = next_hole->start - ack;
+ sb->last_bytes_delivered +=
+ next_hole->start - hole->end;
+ }
+ else
+ {
+ ASSERT (seq_geq (sb->high_sacked, ack));
+ sb->snd_una_adv = sb->high_sacked - ack;
+ sb->last_bytes_delivered += sb->high_sacked - hole->end;
+ }
+ }
+
+ scoreboard_remove_hole (sb, hole);
+ hole = next_hole;
+ }
+ /* Partial 'head' overlap */
+ else
+ {
+ if (seq_gt (blk->end, hole->start))
+ {
+ hole->start = blk->end;
+ }
+ blk_index++;
+ }
+ }
+ else
+ {
+ /* Hole must be split */
+ if (seq_lt (blk->end, hole->end))
+ {
+ hole_index = scoreboard_hole_index (sb, hole);
+ next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
+ hole->end);
+
+ /* Pool might've moved */
+ hole = scoreboard_get_hole (sb, hole_index);
+ hole->end = blk->start;
+ blk_index++;
+ ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
+ }
+ else if (seq_lt (blk->start, hole->end))
+ {
+ hole->end = blk->start;
+ }
+ hole = scoreboard_next_hole (sb, hole);
+ }
+ }
+
+ scoreboard_update_bytes (tc, sb);
+ sb->last_sacked_bytes = sb->sacked_bytes
+ - (old_sacked_bytes - sb->last_bytes_delivered);
+ ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes);
+ ASSERT (sb->sacked_bytes == 0
+ || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
+ ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
+ - seq_max (tc->snd_una, ack));
+ ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
+ || sb->holes[sb->head].start == ack + sb->snd_una_adv);
+}
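+
+/* Example of the hole walk above, with illustrative values: snd_una = 1000,
+ * snd_una_max = 5000, an empty scoreboard and an ack for 1000 carrying the
+ * SACK block [2000, 3000). A first hole [1000, 5000) is inserted and then
+ * split by the block into [1000, 2000) and [3000, 5000); high_sacked
+ * becomes 3000, sacked_bytes and last_sacked_bytes become 1000, and the
+ * head hole [1000, 2000) is marked lost, making it eligible for
+ * retransmission. */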
+
+/**
+ * Try to update snd_wnd based on feedback received from peer.
+ *
+ * If successful, and new window is 'effectively' 0, activate persist
+ * timer.
+ */
+static void
+tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
+{
+ /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
+ * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
+ if (seq_lt (tc->snd_wl1, seq)
+ || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
+ {
+ tc->snd_wnd = snd_wnd;
+ tc->snd_wl1 = seq;
+ tc->snd_wl2 = ack;
+ TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
+
+ if (tc->snd_wnd < tc->snd_mss)
+ {
+ /* Set persist timer if not set and we just got 0 wnd */
+ if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
+ && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
+ tcp_persist_timer_set (tc);
+ }
+ else
+ {
+ tcp_persist_timer_reset (tc);
+ if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ {
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+ }
+ }
+ }
+}
+
+void
+tcp_cc_init_congestion (tcp_connection_t * tc)
+{
+ tcp_fastrecovery_on (tc);
+ tc->snd_congestion = tc->snd_una_max;
+ tc->cc_algo->congestion (tc);
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
+}
+
+static void
+tcp_cc_recovery_exit (tcp_connection_t * tc)
+{
+ /* Deflate rto */
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+ tc->snd_rxt_ts = 0;
+ tc->snd_nxt = tc->snd_una_max;
+ tcp_recovery_off (tc);
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
+}
+
+void
+tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
+{
+ tc->cc_algo->recovered (tc);
+ tc->snd_rxt_bytes = 0;
+ tc->rcv_dupacks = 0;
+ tc->snd_nxt = tc->snd_una_max;
+ tcp_fastrecovery_off (tc);
+ tcp_fastrecovery_1_smss_off (tc);
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
+}
+
+static void
+tcp_cc_congestion_undo (tcp_connection_t * tc)
+{
+ tc->cwnd = tc->prev_cwnd;
+ tc->ssthresh = tc->prev_ssthresh;
+ tc->snd_nxt = tc->snd_una_max;
+ tc->rcv_dupacks = 0;
+ if (tcp_in_recovery (tc))
+ tcp_cc_recovery_exit (tc);
+ ASSERT (tc->rto_boff == 0);
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
+ /* TODO extend for fastrecovery */
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+ return (tcp_in_recovery (tc) && tc->rto_boff == 1
+ && tc->snd_rxt_ts
+ && tcp_opts_tstamp (&tc->rcv_opts)
+ && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
+}
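+
+/* The timestamp test above is Eifel-style detection (cf. RFC3522): if the
+ * echoed tsecr predates snd_rxt_ts, the time of the first retransmit, the
+ * ack must have been triggered by the original transmission, so the timeout
+ * was spurious and cwnd/ssthresh can be restored by
+ * tcp_cc_congestion_undo. */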
+
+int
+tcp_cc_recover (tcp_connection_t * tc)
+{
+ ASSERT (tcp_in_cong_recovery (tc));
+ if (tcp_cc_is_spurious_retransmit (tc))
+ {
+ tcp_cc_congestion_undo (tc);
+ return 1;
+ }
+
+ if (tcp_in_recovery (tc))
+ tcp_cc_recovery_exit (tc);
+ else if (tcp_in_fastrecovery (tc))
+ tcp_cc_fastrecovery_exit (tc);
+
+ ASSERT (tc->rto_boff == 0);
+ ASSERT (!tcp_in_cong_recovery (tc));
+ ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
+ return 0;
+}
+
+static void
+tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
+
+ /* Congestion avoidance */
+ tc->cc_algo->rcv_ack (tc);
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+
+ /* If a cumulative ack, make sure dupacks is 0 */
+ tc->rcv_dupacks = 0;
+
+ /* When the dupack count hits the threshold we only enter fast retransmit
+ * if the cumulative ack covers more than snd_congestion. Should snd_una
+ * wrap, this test may fail under otherwise valid circumstances.
+ * Therefore, proactively update snd_congestion when wrap is detected. */
+ if (PREDICT_FALSE
+ (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
+ && seq_gt (tc->snd_congestion, tc->snd_una)))
+ tc->snd_congestion = tc->snd_una - 1;
+}
+
+static u8
+tcp_should_fastrecover_sack (tcp_connection_t * tc)
+{
+ return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
+}
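+
+/* E.g., with snd_mss = 1448 and the usual TCP_DUPACK_THRESHOLD of 3, fast
+ * recovery is triggered once more than (3 - 1) * 1448 = 2896 bytes have
+ * been SACKed, mirroring the duplicate-ack-equivalent heuristic of
+ * RFC6675. */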
+
+static u8
+tcp_should_fastrecover (tcp_connection_t * tc)
+{
+ return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
+ || tcp_should_fastrecover_sack (tc));
+}
+
+/**
+ * One function to rule them all ... and in the darkness bind them
+ */
+static void
+tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
+{
+ u32 rxt_delivered;
+
+ /*
+ * Duplicate ACK. Check if we should enter fast recovery or, if already in
+ * it, account for the bytes that left the network.
+ */
+ if (is_dack)
+ {
+ ASSERT (tc->snd_una != tc->snd_una_max
+ || tc->sack_sb.last_sacked_bytes);
+
+ tc->rcv_dupacks++;
+
+ if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
+ {
+ ASSERT (tcp_in_fastrecovery (tc));
+ /* Pure duplicate ack. If some data got acked, it's handled below */
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ return;
+ }
+ else if (tcp_should_fastrecover (tc))
+ {
+ /* Things are already bad */
+ if (tcp_in_cong_recovery (tc))
+ {
+ tc->rcv_dupacks = 0;
+ goto partial_ack_test;
+ }
+
+ /* If one of the two conditions below holds, reset dupacks because we're
+ * probably after a timeout (RFC6582 heuristics). That is, if the
+ * cumulative ack does not cover more than the congestion threshold,
+ * and:
+ * 1) The following doesn't hold: The congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes
+ * 2) Echoed timestamp in the last non-dup ack does not equal the
+ * stored timestamp
+ */
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ {
+ tc->rcv_dupacks = 0;
+ return;
+ }
+
+ tcp_cc_init_congestion (tc);
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+
+ /* The first segment MUST be retransmitted */
+ tcp_retransmit_first_unacked (tc);
+
+ /* Post retransmit update cwnd to ssthresh and account for the
+ * three segments that have left the network and should've been
+ * buffered at the receiver XXX */
+ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
+ ASSERT (tc->cwnd >= tc->snd_mss);
+
+ /* If cwnd allows, send more data */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ scoreboard_init_high_rxt (&tc->sack_sb,
+ tc->snd_una + tc->snd_mss);
+ tcp_fast_retransmit_sack (tc);
+ }
+ else
+ {
+ tcp_fast_retransmit_no_sack (tc);
+ }
+
+ return;
+ }
+ else if (!tc->bytes_acked
+ || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
+ {
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ return;
+ }
+ else
+ goto partial_ack;
+ }
+
+partial_ack_test:
+
+ if (!tc->bytes_acked)
+ return;
+
+partial_ack:
+ /*
+ * Legitimate ACK. 1) See if we can exit recovery
+ */
+ /* XXX limit this only to first partial ack? */
+ tcp_retransmit_timer_update (tc);
+
+ if (seq_geq (tc->snd_una, tc->snd_congestion))
+ {
+ /* If spurious return, we've already updated everything */
+ if (tcp_cc_recover (tc))
+ {
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ return;
+ }
+
+ tc->snd_nxt = tc->snd_una_max;
+
+ /* Treat as congestion avoidance ack */
+ tc->cc_algo->rcv_ack (tc);
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ return;
+ }
+
+ /*
+ * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
+ */
+ TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
+
+ /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
+ * reset dupacks to 0 */
+ tc->rcv_dupacks = 0;
+
+ tcp_retransmit_first_unacked (tc);
+
+ /* Post RTO timeout don't try anything fancy */
+ if (tcp_in_recovery (tc))
+ return;
+
+ /* Remove retransmitted bytes that have been delivered */
+ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+ >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
+
+ if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ {
+ /* If we have sacks and we haven't gotten an ack beyond high_rxt,
+ * remove sacked bytes delivered */
+ rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+ - tc->sack_sb.last_bytes_delivered;
+ ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+ tc->snd_rxt_bytes -= rxt_delivered;
+ }
+ else
+ {
+ /* Either all retransmitted holes have been acked, or we're
+ * "in the blind" and retransmitting segment by segment */
+ tc->snd_rxt_bytes = 0;
+ }
+
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
+
+ /*
+ * Since this was a partial ack, try to retransmit some more data
+ */
+ tcp_fast_retransmit (tc);
+}
+
+void
+tcp_cc_init (tcp_connection_t * tc)
+{
+ tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO);
+ tc->cc_algo->init (tc);
+}
+
+/**
+ * Process incoming ACK
+ */
+static int
+tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
+ tcp_header_t * th, u32 * next, u32 * error)
+{
+ u32 prev_snd_wnd, prev_snd_una;
+ u8 is_dack;
+
+ TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
+
+ /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
+ if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
+ {
+ /* If we have outstanding data and this is within the window, accept it;
+ * the retransmit has probably timed out. Otherwise ACK the segment and
+ * then drop it */
+ if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
+ {
+ tcp_make_ack (tc, b);
+ *next = tcp_next_output (tc->c_is_ip4);
+ *error = TCP_ERROR_ACK_INVALID;
+ TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
+ vnet_buffer (b)->tcp.ack_number);
+ return -1;
+ }
+
+ TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
+ vnet_buffer (b)->tcp.ack_number);
+
+ tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+ *error = TCP_ERROR_ACK_FUTURE;
+ }
+
+ /* If old ACK, probably it's an old dupack */
+ if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
+ {
+ *error = TCP_ERROR_ACK_OLD;
+ TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
+ vnet_buffer (b)->tcp.ack_number);
+ if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
+ {
+ TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc);
+ tcp_cc_handle_event (tc, 1);
+ }
+ /* Don't drop yet */
+ return 0;
+ }
+
+ /*
+ * Looks okay, process feedback
+ */
+
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
+
+ prev_snd_wnd = tc->snd_wnd;
+ prev_snd_una = tc->snd_una;
+ tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
+ vnet_buffer (b)->tcp.ack_number,
+ clib_net_to_host_u16 (th->window) << tc->snd_wscale);
+ tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
+ tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
+ tcp_validate_txf_size (tc, tc->bytes_acked);
+
+ if (tc->bytes_acked)
+ tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+
+ TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+
+ /*
+ * Check if we have congestion event
+ */
+
+ if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
+ {
+ tcp_cc_handle_event (tc, is_dack);
+ if (!tcp_in_cong_recovery (tc))
+ return 0;
+ *error = TCP_ERROR_ACK_DUP;
+ TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
+ return vnet_buffer (b)->tcp.data_len ? 0 : -1;
+ }
+
+ /*
+ * Update congestion control (slow start/congestion avoidance)
+ */
+ tcp_cc_update (tc, b);
+
+ return 0;
+}
+
+static u8
+tcp_sack_vector_is_sane (sack_block_t * sacks)
+{
+ int i;
+ for (i = 1; i < vec_len (sacks); i++)
+ {
+ if (sacks[i - 1].end == sacks[i].start)
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Build SACK list as per RFC2018.
+ *
+ * Makes sure the first block contains the segment that generated the current
+ * ACK and the following ones are the ones most recently reported in SACK
+ * blocks.
+ *
+ * @param tc TCP connection for which the SACK list is updated
+ * @param start Start sequence number of the newest SACK block
+ * @param end End sequence of the newest SACK block
+ */
+void
+tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
+{
+ sack_block_t *new_list = 0, *block = 0;
+ int i;
+
+ /* If the first segment is ooo add it to the list. Last write might've moved
+ * rcv_nxt over the first segment. */
+ if (seq_lt (tc->rcv_nxt, start))
+ {
+ vec_add2 (new_list, block, 1);
+ block->start = start;
+ block->end = end;
+ }
+
+ /* Find the blocks still worth keeping. */
+ for (i = 0; i < vec_len (tc->snd_sacks); i++)
+ {
+ /* Discard if rcv_nxt advanced beyond current block */
+ if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
+ continue;
+
+ /* Merge or drop if segment overlapped by the new segment */
+ if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
+ && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
+ {
+ if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
+ new_list[0].start = tc->snd_sacks[i].start;
+ if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
+ new_list[0].end = tc->snd_sacks[i].end;
+ continue;
+ }
+
+ /* Save to new SACK list if we have space. */
+ if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
+ {
+ vec_add1 (new_list, tc->snd_sacks[i]);
+ }
+ else
+ {
+ clib_warning ("sack discarded");
+ }
+ }
+
+ ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
+
+ /* Replace old vector with new one */
+ vec_free (tc->snd_sacks);
+ tc->snd_sacks = new_list;
+
+ /* Segments should not 'touch' */
+ ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
+}
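+
+/* E.g., with rcv_nxt = 1000, an existing SACK list holding [1500, 1600) and
+ * a new out-of-order segment [1600, 1700): the new block is placed first
+ * and the overlap test merges the old block into it, leaving a single
+ * reported block [1500, 1700), with the most recently received data first
+ * as RFC2018 requires. */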
+
+/** Enqueue data for delivery to application */
+always_inline int
+tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
+ u16 data_len)
+{
+ int written, error = TCP_ERROR_ENQUEUED;
+
+ ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
+ /* Pure ACK. Update rcv_nxt and be done. */
+ if (PREDICT_FALSE (data_len == 0))
+ {
+ return TCP_ERROR_PURE_ACK;
+ }
+
+ written = stream_session_enqueue_data (&tc->connection, b, 0,
+ 1 /* queue event */ , 1);
+
+ TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);
+
+ /* Update rcv_nxt */
+ if (PREDICT_TRUE (written == data_len))
+ {
+ tc->rcv_nxt += written;
+ }
+ /* If more data written than expected, account for out-of-order bytes. */
+ else if (written > data_len)
+ {
+ tc->rcv_nxt += written;
+
+ /* Send ACK confirming the update */
+ tc->flags |= TCP_CONN_SNDACK;
+ }
+ else if (written > 0)
+ {
+ /* We've written something but FIFO is probably full now */
+ tc->rcv_nxt += written;
+
+ /* Depending on how fast the app is, not all remaining buffers in the
+ * burst will be enqueued. Inform the peer */
+ tc->flags |= TCP_CONN_SNDACK;
+
+ error = TCP_ERROR_PARTIALLY_ENQUEUED;
+ }
+ else
+ {
+ tc->flags |= TCP_CONN_SNDACK;
+ return TCP_ERROR_FIFO_FULL;
+ }
+
+ /* Update SACK list if need be */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ /* Remove SACK blocks that have been delivered */
+ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
+ }
+
+ return error;
+}
+
+/** Enqueue out-of-order data */
+always_inline int
+tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
+ u16 data_len)
+{
+ stream_session_t *s0;
+ int rv, offset;
+
+ ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
+ /* Pure ACK. Do nothing */
+ if (PREDICT_FALSE (data_len == 0))
+ {
+ return TCP_ERROR_PURE_ACK;
+ }
+
+ /* Enqueue out-of-order data with relative offset */
+ rv = stream_session_enqueue_data (&tc->connection, b,
+ vnet_buffer (b)->tcp.seq_number -
+ tc->rcv_nxt, 0 /* queue event */ , 0);
+
+ /* Nothing written */
+ if (rv)
+ {
+ TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0);
+ return TCP_ERROR_FIFO_FULL;
+ }
+
+ TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);
+
+ /* Update SACK list if in use */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ ooo_segment_t *newest;
+ u32 start, end;
+
+ s0 = stream_session_get (tc->c_s_index, tc->c_thread_index);
+
+ /* Get the newest segment from the fifo */
+ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
+ if (newest)
+ {
+ offset = ooo_segment_offset (s0->server_rx_fifo, newest);
+ ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
+ start = tc->rcv_nxt + offset;
+ end = start + ooo_segment_length (s0->server_rx_fifo, newest);
+ tcp_update_sack_list (tc, start, end);
+ svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo);
+ }
+ }
+
+ return TCP_ERROR_ENQUEUED;
+}
+
+/**
+ * Check if the ACK can be delayed. Returns 1 if the ack can be delayed
+ * and 0 if it must be sent immediately, e.g., when TCP_ALWAYS_ACK is set.
+ */
+always_inline int
+tcp_can_delack (tcp_connection_t * tc)
+{
+ /* Send ack if ... */
+ if (TCP_ALWAYS_ACK
+ /* just sent a rcv wnd 0 */
+ || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0
+ /* constrained to send ack */
+ || (tc->flags & TCP_CONN_SNDACK) != 0
+ /* we're almost out of tx wnd */
+ || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
+ return 0;
+
+ return 1;
+}
+
+static int
+tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
+{
+ u32 discard, first = b->current_length, to_drop = n_bytes_to_drop;
+ vlib_buffer_t *first_b = b;
+ vlib_main_t *vm = vlib_get_main ();
+
+ /* Handle multi-buffer segments */
+ if (n_bytes_to_drop > b->current_length)
+ {
+ if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ return -1;
+ do
+ {
+ discard = clib_min (n_bytes_to_drop, b->current_length);
+ vlib_buffer_advance (b, discard);
+ n_bytes_to_drop -= discard;
+ /* Only chase the chain while there's still something to drop */
+ if (n_bytes_to_drop)
+ {
+ if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ return -1;
+ b = vlib_get_buffer (vm, b->next_buffer);
+ }
+ }
+ while (n_bytes_to_drop);
+ /* Bytes dropped past the first buffer no longer count towards the
+ * chained length */
+ first_b->total_length_not_including_first_buffer -= to_drop - first;
+ }
+ else
+ vlib_buffer_advance (b, n_bytes_to_drop);
+ /* Account on the head buffer, which is what the caller keeps using */
+ vnet_buffer (first_b)->tcp.data_len -= to_drop;
+ return 0;
+}
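+
+/* E.g., dropping 3000 bytes from a two-buffer chain of 2048 bytes each
+ * consumes all of the first buffer and 952 bytes of the second;
+ * total_length_not_including_first_buffer shrinks by the 952 bytes dropped
+ * past the first buffer. */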
+
+static int
+tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
+ u32 * next0)
+{
+ u32 error = 0, n_bytes_to_drop, n_data_bytes;
+
+ vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
+ n_data_bytes = vnet_buffer (b)->tcp.data_len;
+ ASSERT (n_data_bytes);
+
+ /* Handle out-of-order data */
+ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
+ {
+ /* Old sequence numbers allowed through because they overlapped
+ * the rx window */
+ if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
+ {
+ error = TCP_ERROR_SEGMENT_OLD;
+ *next0 = TCP_NEXT_DROP;
+
+ /* Completely in the past (possible retransmit) */
+ if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
+ {
+ /* Ack retransmissions since we may not have any data to send */
+ tcp_make_ack (tc, b);
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ goto done;
+ }
+
+ /* Chop off the bytes in the past */
+ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
+ n_data_bytes -= n_bytes_to_drop;
+ vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
+ if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
+ goto done;
+
+ goto in_order;
+ }
+
+ error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
+
+ /* N.B. Should not filter bursts of dupacks. Two issues: 1) dupacks open
+ * cwnd on the remote peer when congested and 2) acks leaving should have
+ * the latest rcv_wnd since the burst may have eaten up all of it, so
+ * only the old ones could be filtered.
+ */
+
+ /* RFC2581: Send DUPACK for fast retransmit */
+ tcp_make_ack (tc, b);
+ *next0 = tcp_next_output (tc->c_is_ip4);
+
+ /* Mark as DUPACK. We may filter these in output if
+ * the burst fills the holes. */
+ if (n_data_bytes)
+ vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
+
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
+ goto done;
+ }
+
+in_order:
+
+ /* In order data, enqueue. Fifo figures out by itself if any out-of-order
+ * segments can be enqueued after fifo tail offset changes. */
+ error = tcp_session_enqueue_data (tc, b, n_data_bytes);
+
+ /* Check if ACK can be delayed */
+ if (tcp_can_delack (tc))
+ {
+ if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
+ tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+ goto done;
+ }
+
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ tcp_make_ack (tc, b);
+
+done:
+ return error;
+}
+
+typedef struct
+{
+ tcp_header_t tcp_header;
+ tcp_connection_t tcp_connection;
+} tcp_rx_trace_t;
+
+u8 *
+format_tcp_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%U\n%U%U",
+ format_tcp_header, &t->tcp_header, 128,
+ format_white_space, indent,
+ format_tcp_connection, &t->tcp_connection, 1);
+
+ return s;
+}
+
+u8 *
+format_tcp_rx_trace_short (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+
+ s = format (s, "%d -> %d (%U)",
+ clib_net_to_host_u16 (t->tcp_header.src_port),
+ clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
+ t->tcp_connection.state);
+
+ return s;
+}
+
+void
+tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
+ tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
+{
+ if (tc0)
+ {
+ clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection));
+ }
+ else
+ {
+ th0 = tcp_buffer_hdr (b0);
+ }
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+}
+
+always_inline void
+tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val)
+{
+ if (PREDICT_TRUE (!val))
+ return;
+
+ if (is_ip4)
+ vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val);
+ else
+ vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val);
+}
+
+always_inline uword
+tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index, errors = 0;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u8 is_fin = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_header_t *th0 = 0;
+ tcp_connection_t *tc0;
+ u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
+ my_thread_index);
+
+ if (PREDICT_FALSE (tc0 == 0))
+ {
+ error0 = TCP_ERROR_INVALID_CONNECTION;
+ goto done;
+ }
+
+ th0 = tcp_buffer_hdr (b0);
+ /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
+ * dangling reference. */
+ is_fin = tcp_is_fin (th0);
+
+ /* SYNs, FINs and data consume sequence numbers */
+ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
+ + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len;
+
+ /* TODO header prediction fast path */
+
+ /* 1-4: check SEQ, RST, SYN */
+ if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0)))
+ {
+ error0 = TCP_ERROR_SEGMENT_INVALID;
+ TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
+ vnet_buffer (b0)->tcp.seq_number,
+ vnet_buffer (b0)->tcp.seq_end);
+ goto done;
+ }
+
+ /* 5: check the ACK field */
+ if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
+ goto done;
+
+ /* 6: check the URG bit TODO */
+
+ /* 7: process the segment text */
+ if (vnet_buffer (b0)->tcp.data_len)
+ error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
+
+ /* 8: check the FIN bit */
+ if (PREDICT_FALSE (is_fin))
+ {
+ /* Enter CLOSE-WAIT and notify session. To avoid lingering
+ * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
+ /* Account for the FIN if nothing else was received */
+ if (vnet_buffer (b0)->tcp.data_len == 0)
+ tc0->rcv_nxt += 1;
+ tcp_make_ack (tc0, b0);
+ next0 = tcp_next_output (tc0->c_is_ip4);
+ tc0->state = TCP_STATE_CLOSE_WAIT;
+ stream_session_disconnect_notify (&tc0->connection);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+ TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+ }
+
+ done:
+ b0->error = node->errors[error0];
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ tcp_rx_trace_t *t0 =
+ vlib_add_trace (vm, node, b0, sizeof (*t0));
+ tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ errors = session_manager_flush_enqueue_events (my_thread_index);
+ tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
+ tcp_flush_frame_to_output (vm, my_thread_index, is_ip4);
+
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_established_node) =
+{
+ .function = tcp4_established,
+ .name = "tcp4-established",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_established_node) =
+{
+ .function = tcp6_established,
+ .name = "tcp6-established",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established);
+
+vlib_node_registration_t tcp4_syn_sent_node;
+vlib_node_registration_t tcp6_syn_sent_node;
+
+static u8
+tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
+{
+ transport_connection_t *tmp;
+ if (!tc)
+ return 1;
+
+ u8 is_valid = (tc->c_lcl_port == hdr->dst_port
+ && (tc->state == TCP_STATE_LISTEN
+ || tc->c_rmt_port == hdr->src_port));
+
+ if (!is_valid)
+ {
+ if ((tmp =
+ stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip,
+ tc->c_lcl_port, tc->c_rmt_port,
+ tc->c_transport_proto)))
+ {
+ if (tmp->lcl_port == hdr->dst_port
+ && tmp->rmt_port == hdr->src_port)
+ {
+ clib_warning ("half-open is valid!");
+ }
+ }
+ }
+ return is_valid;
+}
+
+/**
+ * Lookup transport connection
+ */
+static tcp_connection_t *
+tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4)
+{
+ tcp_header_t *tcp;
+ transport_connection_t *tconn;
+ tcp_connection_t *tc;
+ if (is_ip4)
+ {
+ ip4_header_t *ip4;
+ ip4 = vlib_buffer_get_current (b);
+ tcp = ip4_next_header (ip4);
+ tconn = stream_session_lookup_transport_wt4 (&ip4->dst_address,
+ &ip4->src_address,
+ tcp->dst_port,
+ tcp->src_port,
+ SESSION_TYPE_IP4_TCP,
+ thread_index);
+ tc = tcp_get_connection_from_transport (tconn);
+ ASSERT (tcp_lookup_is_valid (tc, tcp));
+ }
+ else
+ {
+ ip6_header_t *ip6;
+ ip6 = vlib_buffer_get_current (b);
+ tcp = ip6_next_header (ip6);
+ tconn = stream_session_lookup_transport_wt6 (&ip6->dst_address,
+ &ip6->src_address,
+ tcp->dst_port,
+ tcp->src_port,
+ SESSION_TYPE_IP6_TCP,
+ thread_index);
+ tc = tcp_get_connection_from_transport (tconn);
+ ASSERT (tcp_lookup_is_valid (tc, tcp));
+ }
+ return tc;
+}
+
+always_inline uword
+tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index, errors = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, ack0, seq0;
+ vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
+ tcp_header_t *tcp0 = 0;
+ tcp_connection_t *tc0;
+ tcp_connection_t *new_tc0;
+ u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ tc0 =
+ tcp_half_open_connection_get (vnet_buffer (b0)->
+ tcp.connection_index);
+ if (PREDICT_FALSE (tc0 == 0))
+ {
+ error0 = TCP_ERROR_INVALID_CONNECTION;
+ goto drop;
+ }
+
+	  /* Half-open completed recently but the connection wasn't removed
+	   * yet by the owning thread */
+ if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
+ {
+ /* Make sure the connection actually exists */
+ ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4));
+ goto drop;
+ }
+
+ ack0 = vnet_buffer (b0)->tcp.ack_number;
+ seq0 = vnet_buffer (b0)->tcp.seq_number;
+ tcp0 = tcp_buffer_hdr (b0);
+
+	  /* Crude check for a mismatch between the connection handle and
+	   * the packet. Most likely the connection just switched to established */
+ if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
+ || tcp0->src_port != tc0->c_rmt_port))
+ goto drop;
+
+ if (PREDICT_FALSE
+ (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
+ goto drop;
+
+ /* SYNs, FINs and data consume sequence numbers */
+ vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0)
+ + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len;
+
+ /*
+ * 1. check the ACK bit
+ */
+
+ /*
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
+ * the RST bit is set, if so drop the segment and return)
+ * <SEQ=SEG.ACK><CTL=RST>
+ * and discard the segment. Return.
+ * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+ */
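+	  /* Illustration of the check below with wrapped sequence space
+	   * (assumed values): with iss = 0xfffffff0 and snd_nxt = 0x10, an
+	   * ack0 of 0x05 is acceptable, since the signed differences
+	   * ack0 - iss = 0x15 > 0 and ack0 - snd_nxt = -0xb < 0 are exactly
+	   * what seq_leq()/seq_gt() compute modulo 2^32. */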
+ if (tcp_ack (tcp0))
+ {
+ if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
+ {
+ clib_warning ("ack not in rcv wnd");
+ if (!tcp_rst (tcp0))
+ tcp_send_reset_w_pkt (tc0, b0, is_ip4);
+ goto drop;
+ }
+
+ /* Make sure ACK is valid */
+ if (seq_gt (tc0->snd_una, ack0))
+ {
+ clib_warning ("ack invalid");
+ goto drop;
+ }
+ }
+
+ /*
+ * 2. check the RST bit
+ */
+
+ if (tcp_rst (tcp0))
+ {
+ /* If ACK is acceptable, signal client that peer is not
+ * willing to accept connection and drop connection*/
+ if (tcp_ack (tcp0))
+ tcp_connection_reset (tc0);
+ goto drop;
+ }
+
+ /*
+ * 3. check the security and precedence (skipped)
+ */
+
+ /*
+ * 4. check the SYN bit
+ */
+
+ /* No SYN flag. Drop. */
+ if (!tcp_syn (tcp0))
+ {
+ clib_warning ("not synack");
+ goto drop;
+ }
+
+ /* Parse options */
+ if (tcp_options_parse (tcp0, &tc0->rcv_opts))
+ {
+ clib_warning ("options parse fail");
+ goto drop;
+ }
+
+ /* Valid SYN or SYN-ACK. Move connection from half-open pool to
+ * current thread pool. */
+ pool_get (tm->connections[my_thread_index], new_tc0);
+ clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
+ new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
+ new_tc0->c_thread_index = my_thread_index;
+ new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
+ new_tc0->irs = seq0;
+ new_tc0->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
+ new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] =
+ TCP_TIMER_HANDLE_INVALID;
+
+	  /* If this is not the owning thread, wait for the SYN retransmit
+	   * timer to expire and clean up then */
+ if (tcp_half_open_connection_cleanup (tc0))
+ tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
+
+ if (tcp_opts_tstamp (&new_tc0->rcv_opts))
+ {
+ new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
+ new_tc0->tsval_recent_age = tcp_time_now ();
+ }
+
+ if (tcp_opts_wscale (&new_tc0->rcv_opts))
+ new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
+
+ /* RFC1323: SYN and SYN-ACK wnd not scaled */
+ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
+ new_tc0->snd_wl1 = seq0;
+ new_tc0->snd_wl2 = ack0;
+
+ tcp_connection_init_vars (new_tc0);
+
+ /* SYN-ACK: See if we can switch to ESTABLISHED state */
+ if (PREDICT_TRUE (tcp_ack (tcp0)))
+ {
+ /* Our SYN is ACKed: we have iss < ack = snd_una */
+
+ /* TODO Dequeue acknowledged segments if we support Fast Open */
+ new_tc0->snd_una = ack0;
+ new_tc0->state = TCP_STATE_ESTABLISHED;
+
+ /* Make sure las is initialized for the wnd computation */
+ new_tc0->rcv_las = new_tc0->rcv_nxt;
+
+ /* Notify app that we have connection. If session layer can't
+ * allocate session send reset */
+ if (stream_session_connect_notify (&new_tc0->connection, 0))
+ {
+ clib_warning ("connect notify fail");
+ tcp_send_reset_w_pkt (new_tc0, b0, is_ip4);
+ tcp_connection_cleanup (new_tc0);
+ goto drop;
+ }
+
+ /* Make sure after data segment processing ACK is sent */
+ new_tc0->flags |= TCP_CONN_SNDACK;
+
+ /* Update rtt with the syn-ack sample */
+ tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
+ }
+ /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
+ else
+ {
+ new_tc0->state = TCP_STATE_SYN_RCVD;
+
+ /* Notify app that we have connection */
+ if (stream_session_connect_notify (&new_tc0->connection, 0))
+ {
+ tcp_connection_cleanup (new_tc0);
+ tcp_send_reset_w_pkt (tc0, b0, is_ip4);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
+ goto drop;
+ }
+
+ tc0->rtt_ts = 0;
+ tcp_init_snd_vars (tc0);
+ tcp_make_synack (new_tc0, b0);
+ next0 = tcp_next_output (is_ip4);
+
+ goto drop;
+ }
+
+ /* Read data, if any */
+ if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
+ {
+ ASSERT (0);
+ error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0);
+ if (error0 == TCP_ERROR_PURE_ACK)
+ error0 = TCP_ERROR_SYN_ACKS_RCVD;
+ }
+ else
+ {
+ tcp_make_ack (new_tc0, b0);
+ next0 = tcp_next_output (new_tc0->c_is_ip4);
+ }
+
+ drop:
+
+ b0->error = error0 ? node->errors[error0] : 0;
+ if (PREDICT_FALSE
+ ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, tc0,
+ sizeof (t0->tcp_connection));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ errors = session_manager_flush_enqueue_events (my_thread_index);
+ if (errors)
+ {
+ if (is_ip4)
+ vlib_node_increment_counter (vm, tcp4_established_node.index,
+ TCP_ERROR_EVENT_FIFO_FULL, errors);
+ else
+ vlib_node_increment_counter (vm, tcp6_established_node.index,
+ TCP_ERROR_EVENT_FIFO_FULL, errors);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_syn_sent_node) =
+{
+ .function = tcp4_syn_sent,
+ .name = "tcp4-syn-sent",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_SYN_SENT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
+{
+ .function = tcp6_syn_sent_rcv,
+ .name = "tcp6-syn-sent",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_SYN_SENT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
+
+/**
+ * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
+ * as per RFC793 p. 64
+ */
+always_inline uword
+tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index, errors = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_header_t *tcp0 = 0;
+ tcp_connection_t *tc0;
+ u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
+ u8 is_fin0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
+ my_thread_index);
+ if (PREDICT_FALSE (tc0 == 0))
+ {
+ error0 = TCP_ERROR_INVALID_CONNECTION;
+ goto drop;
+ }
+
+ tcp0 = tcp_buffer_hdr (b0);
+ is_fin0 = tcp_is_fin (tcp0);
+
+ /* SYNs, FINs and data consume sequence numbers */
+ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
+ + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len;
+
+ if (CLIB_DEBUG)
+ {
+ tcp_connection_t *tmp;
+ tmp = tcp_lookup_connection (b0, my_thread_index, is_ip4);
+ if (tmp->state != tc0->state)
+ {
+ clib_warning ("state changed");
+ ASSERT (0);
+ goto drop;
+ }
+ }
+
+ /*
+ * Special treatment for CLOSED
+ */
+ switch (tc0->state)
+ {
+ case TCP_STATE_CLOSED:
+ goto drop;
+ break;
+ }
+
+ /*
+ * For all other states (except LISTEN)
+ */
+
+ /* 1-4: check SEQ, RST, SYN */
+ if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0,
+ &next0)))
+ {
+ error0 = TCP_ERROR_SEGMENT_INVALID;
+ goto drop;
+ }
+
+ /* 5: check the ACK field */
+ switch (tc0->state)
+ {
+ case TCP_STATE_SYN_RCVD:
+ /*
+ * If the segment acknowledgment is not acceptable, form a
+ * reset segment,
+ * <SEQ=SEG.ACK><CTL=RST>
+ * and send it.
+ */
+ if (!tcp_rcv_ack_is_acceptable (tc0, b0))
+ {
+ clib_warning ("connection not accepted");
+ tcp_send_reset_w_pkt (tc0, b0, is_ip4);
+ goto drop;
+ }
+
+ /* Update rtt and rto */
+ tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
+
+ /* Switch state to ESTABLISHED */
+ tc0->state = TCP_STATE_ESTABLISHED;
+
+ /* Initialize session variables */
+ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
+ tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
+ << tc0->rcv_opts.wscale;
+ tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
+ tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
+ stream_session_accept_notify (&tc0->connection);
+
+ /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
+ tcp_retransmit_timer_reset (tc0);
+ tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+ break;
+ case TCP_STATE_ESTABLISHED:
+ /* We can get packets in established state here because they
+ * were enqueued before state change */
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+
+ break;
+ case TCP_STATE_FIN_WAIT_1:
+ /* In addition to the processing for the ESTABLISHED state, if
+ * our FIN is now acknowledged then enter FIN-WAIT-2 and
+ * continue processing in that state. */
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+
+ /* Still have to send the FIN */
+ if (tc0->flags & TCP_CONN_FINPNDG)
+ {
+ /* TX fifo finally drained */
+ if (!stream_session_tx_fifo_max_dequeue (&tc0->connection))
+ tcp_send_fin (tc0);
+ }
+ /* If FIN is ACKed */
+ else if (tc0->snd_una == tc0->snd_una_max)
+ {
+ tc0->state = TCP_STATE_FIN_WAIT_2;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+
+ /* Stop all retransmit timers because we have nothing more
+ * to send. Enable waitclose though because we're willing to
+ * wait for peer's FIN but not indefinitely. */
+ tcp_connection_timers_reset (tc0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ }
+ break;
+ case TCP_STATE_FIN_WAIT_2:
+ /* In addition to the processing for the ESTABLISHED state, if
+ * the retransmission queue is empty, the user's CLOSE can be
+ * acknowledged ("ok") but do not delete the TCB. */
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ /* Do the same processing as for the ESTABLISHED state. */
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+ break;
+ case TCP_STATE_CLOSING:
+ /* In addition to the processing for the ESTABLISHED state, if
+ * the ACK acknowledges our FIN then enter the TIME-WAIT state,
+ * otherwise ignore the segment. */
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+
+ tc0->state = TCP_STATE_TIME_WAIT;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ goto drop;
+
+ break;
+ case TCP_STATE_LAST_ACK:
+ /* The only thing that [should] arrive in this state is an
+ * acknowledgment of our FIN. If our FIN is now acknowledged,
+ * delete the TCB, enter the CLOSED state, and return. */
+
+ if (!tcp_rcv_ack_is_acceptable (tc0, b0))
+ goto drop;
+
+ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
+ /* Apparently our FIN was lost */
+ if (is_fin0)
+ {
+ tcp_send_fin (tc0);
+ goto drop;
+ }
+
+ tc0->state = TCP_STATE_CLOSED;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+
+ /* Don't delete the connection/session yet. Instead, wait a
+ * reasonable amount of time until the pipes are cleared. In
+ * particular, this makes sure that we won't have dead sessions
+ * when processing events on the tx path */
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+ tcp_retransmit_timer_reset (tc0);
+
+ goto drop;
+
+ break;
+ case TCP_STATE_TIME_WAIT:
+ /* The only thing that can arrive in this state is a
+ * retransmission of the remote FIN. Acknowledge it, and restart
+ * the 2 MSL timeout. */
+
+ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+ goto drop;
+
+ tcp_make_ack (tc0, b0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+
+ goto drop;
+
+ break;
+ default:
+ ASSERT (0);
+ }
+
+ /* 6: check the URG bit TODO */
+
+ /* 7: process the segment text */
+ switch (tc0->state)
+ {
+ case TCP_STATE_ESTABLISHED:
+ case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_FIN_WAIT_2:
+ if (vnet_buffer (b0)->tcp.data_len)
+ error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
+ else if (is_fin0)
+ tc0->rcv_nxt += 1;
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ case TCP_STATE_CLOSING:
+ case TCP_STATE_LAST_ACK:
+ case TCP_STATE_TIME_WAIT:
+ /* This should not occur, since a FIN has been received from the
+ * remote side. Ignore the segment text. */
+ break;
+ }
+
+ /* 8: check the FIN bit */
+ if (!is_fin0)
+ goto drop;
+
+ switch (tc0->state)
+ {
+ case TCP_STATE_ESTABLISHED:
+ case TCP_STATE_SYN_RCVD:
+	      /* Send FIN-ACK, notify app and enter CLOSE-WAIT */

+ tcp_connection_timers_reset (tc0);
+ tcp_make_fin (tc0, b0);
+ tc0->snd_nxt += 1;
+ next0 = tcp_next_output (tc0->c_is_ip4);
+ stream_session_disconnect_notify (&tc0->connection);
+ tc0->state = TCP_STATE_CLOSE_WAIT;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ case TCP_STATE_CLOSING:
+ case TCP_STATE_LAST_ACK:
+ /* move along .. */
+ break;
+ case TCP_STATE_FIN_WAIT_1:
+ tc0->state = TCP_STATE_CLOSING;
+ tcp_make_ack (tc0, b0);
+ next0 = tcp_next_output (is_ip4);
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+ /* Wait for ACK but not forever */
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ break;
+ case TCP_STATE_FIN_WAIT_2:
+ /* Got FIN, send ACK! Be more aggressive with resource cleanup */
+ tc0->state = TCP_STATE_TIME_WAIT;
+ tcp_connection_timers_reset (tc0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
+ tcp_make_ack (tc0, b0);
+ next0 = tcp_next_output (is_ip4);
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+ break;
+ case TCP_STATE_TIME_WAIT:
+ /* Remain in the TIME-WAIT state. Restart the time-wait
+ * timeout.
+ */
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
+ break;
+ }
+ TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+
+ drop:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ tcp_rx_trace_t *t0 =
+ vlib_add_trace (vm, node, b0, sizeof (*t0));
+ tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ errors = session_manager_flush_enqueue_events (my_thread_index);
+ if (errors)
+ {
+ if (is_ip4)
+ vlib_node_increment_counter (vm, tcp4_established_node.index,
+ TCP_ERROR_EVENT_FIFO_FULL, errors);
+ else
+ vlib_node_increment_counter (vm, tcp6_established_node.index,
+ TCP_ERROR_EVENT_FIFO_FULL, errors);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_rcv_process_node) =
+{
+ .function = tcp4_rcv_process,
+ .name = "tcp4-rcv-process",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_rcv_process_node) =
+{
+ .function = tcp6_rcv_process,
+ .name = "tcp6-rcv-process",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process);
+
+vlib_node_registration_t tcp4_listen_node;
+vlib_node_registration_t tcp6_listen_node;
+
+/**
+ * LISTEN state processing as per RFC 793 p. 65
+ */
+always_inline uword
+tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index;
+ u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_rx_trace_t *t0;
+ tcp_header_t *th0 = 0;
+ tcp_connection_t *lc0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ tcp_connection_t *child0;
+ u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
+
+ if (is_ip4)
+ {
+ ip40 = vlib_buffer_get_current (b0);
+ th0 = ip4_next_header (ip40);
+ }
+ else
+ {
+ ip60 = vlib_buffer_get_current (b0);
+ th0 = ip6_next_header (ip60);
+ }
+
+ /* Create child session. For syn-flood protection use filter */
+
+ /* 1. first check for an RST: handled in dispatch */
+ /* if (tcp_rst (th0))
+ goto drop; */
+
+ /* 2. second check for an ACK: handled in dispatch */
+ /* if (tcp_ack (th0))
+ {
+ tcp_send_reset (b0, is_ip4);
+ goto drop;
+ } */
+
+ /* 3. check for a SYN (did that already) */
+
+ /* Make sure connection wasn't just created */
+ child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4);
+ if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
+ {
+ error0 = TCP_ERROR_CREATE_EXISTS;
+ goto drop;
+ }
+
+ /* Create child session and send SYN-ACK */
+ child0 = tcp_connection_new (my_thread_index);
+ child0->c_lcl_port = lc0->c_lcl_port;
+ child0->c_rmt_port = th0->src_port;
+ child0->c_is_ip4 = is_ip4;
+ child0->state = TCP_STATE_SYN_RCVD;
+
+ if (is_ip4)
+ {
+ child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
+ child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
+ }
+ else
+ {
+ clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address,
+ sizeof (ip6_address_t));
+ clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address,
+ sizeof (ip6_address_t));
+ }
+
+ if (stream_session_accept (&child0->connection, lc0->c_s_index, sst,
+ 0 /* notify */ ))
+ {
+ clib_warning ("session accept fail");
+ tcp_connection_cleanup (child0);
+ error0 = TCP_ERROR_CREATE_SESSION_FAIL;
+ goto drop;
+ }
+
+ if (tcp_options_parse (th0, &child0->rcv_opts))
+ {
+ clib_warning ("options parse fail");
+ goto drop;
+ }
+
+ child0->irs = vnet_buffer (b0)->tcp.seq_number;
+ child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
+ child0->rcv_las = child0->rcv_nxt;
+
+ /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
+ * segments are used to initialize PAWS. */
+ if (tcp_opts_tstamp (&child0->rcv_opts))
+ {
+ child0->tsval_recent = child0->rcv_opts.tsval;
+ child0->tsval_recent_age = tcp_time_now ();
+ }
+
+ if (tcp_opts_wscale (&child0->rcv_opts))
+ child0->snd_wscale = child0->rcv_opts.wscale;
+
+ child0->snd_wnd = clib_net_to_host_u16 (th0->window)
+ << child0->snd_wscale;
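+	  /* E.g., a SYN advertising window 0xffff with a wscale option of
+	   * 7 yields an initial snd_wnd of 65535 << 7 = 8388480 here. */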
+ child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
+ child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
+
+ tcp_connection_init_vars (child0);
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1);
+
+ /* Reuse buffer to make syn-ack and send */
+ tcp_make_synack (child0, b0);
+ next0 = tcp_next_output (is_ip4);
+ tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME);
+
+ drop:
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+ clib_memcpy (&t0->tcp_connection, lc0,
+ sizeof (t0->tcp_connection));
+ }
+
+ b0->error = node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_listen_node) =
+{
+ .function = tcp4_listen,
+ .name = "tcp4-listen",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_LISTEN_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_listen_node) =
+{
+ .function = tcp6_listen,
+ .name = "tcp6-listen",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_LISTEN_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
+ foreach_tcp_state_next
+#undef _
+ },
+ .format_trace = format_tcp_rx_trace_short,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen);
+
+vlib_node_registration_t tcp4_input_node;
+vlib_node_registration_t tcp6_input_node;
+
+typedef enum _tcp_input_next
+{
+ TCP_INPUT_NEXT_DROP,
+ TCP_INPUT_NEXT_LISTEN,
+ TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_INPUT_NEXT_SYN_SENT,
+ TCP_INPUT_NEXT_ESTABLISHED,
+ TCP_INPUT_NEXT_RESET,
+ TCP_INPUT_NEXT_PUNT,
+ TCP_INPUT_N_NEXT
+} tcp_input_next_t;
+
+#define foreach_tcp4_input_next \
+ _ (DROP, "error-drop") \
+ _ (LISTEN, "tcp4-listen") \
+ _ (RCV_PROCESS, "tcp4-rcv-process") \
+ _ (SYN_SENT, "tcp4-syn-sent") \
+ _ (ESTABLISHED, "tcp4-established") \
+ _ (RESET, "tcp4-reset") \
+ _ (PUNT, "error-punt")
+
+#define foreach_tcp6_input_next \
+ _ (DROP, "error-drop") \
+ _ (LISTEN, "tcp6-listen") \
+ _ (RCV_PROCESS, "tcp6-rcv-process") \
+ _ (SYN_SENT, "tcp6-syn-sent") \
+ _ (ESTABLISHED, "tcp6-established") \
+ _ (RESET, "tcp6-reset") \
+ _ (PUNT, "error-punt")
+
+#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
+
+always_inline uword
+tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ next_index = node->cached_next_index;
+ tcp_set_time_now (my_thread_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ int n_advance_bytes0, n_data_bytes0;
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_header_t *tcp0 = 0;
+ tcp_connection_t *tc0;
+ transport_connection_t *tconn;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP;
+ u8 flags0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ vnet_buffer (b0)->tcp.flags = 0;
+
+	  /* Checksum computed by ipx_local; no need to compute it again */
+
+ if (is_ip4)
+ {
+ ip40 = vlib_buffer_get_current (b0);
+ tcp0 = ip4_next_header (ip40);
+ n_advance_bytes0 = (ip4_header_bytes (ip40)
+ + tcp_header_bytes (tcp0));
+ n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
+ - n_advance_bytes0;
+ tconn = stream_session_lookup_transport_wt4 (&ip40->dst_address,
+ &ip40->src_address,
+ tcp0->dst_port,
+ tcp0->src_port,
+ SESSION_TYPE_IP4_TCP,
+ my_thread_index);
+ tc0 = tcp_get_connection_from_transport (tconn);
+ ASSERT (tcp_lookup_is_valid (tc0, tcp0));
+ }
+ else
+ {
+ ip60 = vlib_buffer_get_current (b0);
+ tcp0 = ip6_next_header (ip60);
+ n_advance_bytes0 = tcp_header_bytes (tcp0);
+ n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length)
+ - n_advance_bytes0;
+ n_advance_bytes0 += sizeof (ip60[0]);
+ tconn = stream_session_lookup_transport_wt6 (&ip60->dst_address,
+ &ip60->src_address,
+ tcp0->dst_port,
+ tcp0->src_port,
+ SESSION_TYPE_IP6_TCP,
+ my_thread_index);
+ tc0 = tcp_get_connection_from_transport (tconn);
+ ASSERT (tcp_lookup_is_valid (tc0, tcp0));
+ }
+
+ /* Length check */
+ if (PREDICT_FALSE (n_advance_bytes0 < 0))
+ {
+ error0 = TCP_ERROR_LENGTH;
+ goto done;
+ }
+
+ /* Session exists */
+ if (PREDICT_TRUE (0 != tc0))
+ {
+ /* Save connection index */
+ vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index;
+ vnet_buffer (b0)->tcp.seq_number =
+ clib_net_to_host_u32 (tcp0->seq_number);
+ vnet_buffer (b0)->tcp.ack_number =
+ clib_net_to_host_u32 (tcp0->ack_number);
+
+ vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0
+ - (u8 *) vlib_buffer_get_current (b0);
+ vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0;
+ vnet_buffer (b0)->tcp.data_len = n_data_bytes0;
+
+ flags0 = tcp0->flags & filter_flags;
+ next0 = tm->dispatch_table[tc0->state][flags0].next;
+ error0 = tm->dispatch_table[tc0->state][flags0].error;
+
+ if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH
+ || next0 == TCP_INPUT_NEXT_RESET))
+ {
+ /* Overload tcp flags to store state */
+ tcp_state_t state0 = tc0->state;
+ vnet_buffer (b0)->tcp.flags = tc0->state;
+
+ if (error0 == TCP_ERROR_DISPATCH)
+ clib_warning ("disp error state %U flags %U",
+ format_tcp_state, state0, format_tcp_flags,
+ (int) flags0);
+ }
+ }
+ else
+ {
+ if ((is_ip4 && tm->punt_unknown4) ||
+ (!is_ip4 && tm->punt_unknown6))
+ {
+ next0 = TCP_INPUT_NEXT_PUNT;
+ error0 = TCP_ERROR_PUNT;
+ }
+ else
+ {
+ /* Send reset */
+ next0 = TCP_INPUT_NEXT_RESET;
+ error0 = TCP_ERROR_NO_LISTENER;
+ }
+ }
+
+ done:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ tcp_rx_trace_t *t0 =
+ vlib_add_trace (vm, node, b0, sizeof (*t0));
+ tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_input_node) =
+{
+ .function = tcp4_input,
+ .name = "tcp4-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_INPUT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
+ foreach_tcp4_input_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_rx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_input_node) =
+{
+ .function = tcp6_input,
+ .name = "tcp6-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_INPUT_N_NEXT,
+ .next_nodes =
+ {
+#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
+ foreach_tcp6_input_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_rx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input);
+
+static void
+tcp_dispatch_table_init (tcp_main_t * tm)
+{
+ int i, j;
+ for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
+ for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
+ {
+ tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
+ tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
+ }
+
+#define _(t,f,n,e) \
+do { \
+ tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
+ tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
+} while (0)
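+
+/* How the table is consulted at runtime (a sketch mirroring
+ * tcp46_input_inline above): the header flags, masked to filter_flags,
+ * index the second dimension. E.g., a pure ACK (TCP_FLAG_ACK = 0x10)
+ * on an ESTABLISHED connection dispatches to tcp4/6-established with
+ * TCP_ERROR_NONE:
+ *
+ *   flags0 = tcp0->flags & filter_flags;
+ *   next0 = tm->dispatch_table[TCP_STATE_ESTABLISHED][flags0].next;
+ */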
+
+ /* SYNs for new connections -> tcp-listen. */
+ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
+ _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
+ _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
+ _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+ TCP_ERROR_NONE);
+  /* ACK for a SYN-ACK -> tcp-rcv-process. */
+ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ /* SYN-ACK for a SYN */
+ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
+ TCP_ERROR_NONE);
+ _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
+ _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
+ _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
+ TCP_ERROR_NONE);
+  /* ACK for an established connection -> tcp-established. */
+ _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+  /* FIN for an established connection -> tcp-established. */
+ _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
+ TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
+ TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+ _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
+ TCP_ERROR_NONE);
+ /* ACK or FIN-ACK to our FIN */
+ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
+ /* FIN in reply to our FIN from the other side */
+ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ /* FIN confirming that the peer (app) has closed */
+ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
+ _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
+ _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
+ _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+ TCP_ERROR_NONE);
+ _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+ _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
+ _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
+ _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
+ TCP_ERROR_CONNECTION_CLOSED);
+#undef _
+}
+
+clib_error_t *
+tcp_input_init (vlib_main_t * vm)
+{
+ clib_error_t *error = 0;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+
+ if ((error = vlib_call_init_function (vm, tcp_init)))
+ return error;
+
+ /* Initialize dispatch table. */
+ tcp_dispatch_table_init (tm);
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (tcp_input_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
new file mode 100644
index 00000000..103fea4c
--- /dev/null
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/tcp/tcp.h>
+
+void
+newreno_congestion (tcp_connection_t * tc)
+{
+ tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
+}
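+
+/* E.g., with 100000 bytes in flight and snd_mss = 1460, ssthresh drops
+ * to 50000; the 2 * snd_mss floor only matters for very small flights. */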
+
+void
+newreno_recovered (tcp_connection_t * tc)
+{
+ tc->cwnd = tc->ssthresh;
+}
+
+void
+newreno_rcv_ack (tcp_connection_t * tc)
+{
+ if (tcp_in_slowstart (tc))
+ {
+ tc->cwnd += clib_min (tc->snd_mss, tc->bytes_acked);
+ }
+ else
+ {
+ /* Round up to 1 if needed */
+ tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1);
+ }
+}
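+
+/* Worked example: in slow start every full-segment ACK grows cwnd by
+ * snd_mss, roughly doubling it each RTT. In congestion avoidance with
+ * snd_mss = 1460 and cwnd = 14600, each ACK adds
+ * 1460 * 1460 / 14600 = 146 bytes, i.e. about one segment per RTT. */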
+
+void
+newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type)
+{
+ if (ack_type == TCP_CC_DUPACK)
+ {
+      if (!tcp_opts_sack_permitted (&tc->rcv_opts))
+ tc->cwnd += tc->snd_mss;
+ }
+ else if (ack_type == TCP_CC_PARTIALACK)
+ {
+ /* RFC 6582 Sec. 3.2 */
+ if (!tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ /* Deflate the congestion window by the amount of new data
+ * acknowledged by the Cumulative Acknowledgment field.
+ * If the partial ACK acknowledges at least one SMSS of new data,
+ * then add back SMSS bytes to the congestion window. This
+ * artificially inflates the congestion window in order to reflect
+ * the additional segment that has left the network. This "partial
+ * window deflation" attempts to ensure that, when fast recovery
+ * eventually ends, approximately ssthresh amount of data will be
+ * outstanding in the network.*/
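+      /* E.g., cwnd = 10000, bytes_acked = 3000, snd_mss = 1460:
+       * deflate to 7000, then add back one SMSS for a cwnd of 8460. */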
+ tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ?
+ tc->cwnd - tc->bytes_acked : tc->snd_mss;
+ if (tc->bytes_acked > tc->snd_mss)
+ tc->cwnd += tc->snd_mss;
+ }
+ }
+}
+
+void
+newreno_conn_init (tcp_connection_t * tc)
+{
+ tc->ssthresh = tc->snd_wnd;
+ tc->cwnd = tcp_initial_cwnd (tc);
+}
+
+const static tcp_cc_algorithm_t tcp_newreno = {
+ .congestion = newreno_congestion,
+ .recovered = newreno_recovered,
+ .rcv_ack = newreno_rcv_ack,
+ .rcv_cong_ack = newreno_rcv_cong_ack,
+ .init = newreno_conn_init
+};
+
+clib_error_t *
+newreno_init (vlib_main_t * vm)
+{
+ clib_error_t *error = 0;
+
+ tcp_cc_algo_register (TCP_CC_NEWRENO, &tcp_newreno);
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (newreno_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
new file mode 100644
index 00000000..a954bfa7
--- /dev/null
+++ b/src/vnet/tcp/tcp_output.c
@@ -0,0 +1,2113 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/tcp/tcp.h>
+#include <vnet/lisp-cp/packets.h>
+#include <math.h>
+
+vlib_node_registration_t tcp4_output_node;
+vlib_node_registration_t tcp6_output_node;
+
+typedef enum _tcp_output_next
+{
+ TCP_OUTPUT_NEXT_DROP,
+ TCP_OUTPUT_NEXT_IP_LOOKUP,
+ TCP_OUTPUT_N_NEXT
+} tcp_output_next_t;
+
+#define foreach_tcp4_output_next \
+ _ (DROP, "error-drop") \
+ _ (IP_LOOKUP, "ip4-lookup")
+
+#define foreach_tcp6_output_next \
+ _ (DROP, "error-drop") \
+ _ (IP_LOOKUP, "ip6-lookup")
+
+static char *tcp_error_strings[] = {
+#define tcp_error(n,s) s,
+#include <vnet/tcp/tcp_error.def>
+#undef tcp_error
+};
+
+typedef struct
+{
+ tcp_header_t tcp_header;
+ tcp_connection_t tcp_connection;
+} tcp_tx_trace_t;
+
+u16 dummy_mtu = 1460;
+
+u8 *
+format_tcp_tx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%U\n%U%U",
+ format_tcp_header, &t->tcp_header, 128,
+ format_white_space, indent,
+ format_tcp_connection, &t->tcp_connection, 1);
+
+ return s;
+}
+
+static u8
+tcp_window_compute_scale (u32 window)
+{
+ u8 wnd_scale = 0;
+ while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
+ wnd_scale++;
+ return wnd_scale;
+}
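+
+/* E.g., assuming TCP_WND_MAX is 65535, a 1 MB window (1 << 20) yields
+ * wnd_scale = 5, since 1048576 >> 5 = 32768 is the first shift that
+ * fits in 16 bits. */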
+
+/**
+ * Update max segment size we're able to process.
+ *
+ * The value is constrained by our interface's MTU and IP options. It is
+ * also what we advertise to our peer.
+ */
+void
+tcp_update_rcv_mss (tcp_connection_t * tc)
+{
+ /* TODO find our iface MTU */
+ tc->mss = dummy_mtu - sizeof (tcp_header_t);
+}
+
+/**
+ * TCP's initial window
+ */
+always_inline u32
+tcp_initial_wnd_unscaled (tcp_connection_t * tc)
+{
+  /* RFC 6928 recommends the value below. However, at the time our connections
+ * are initialized, fifos may not be allocated. Therefore, advertise the
+ * smallest possible unscaled window size and update once fifos are
+ * assigned to the session.
+ */
+ /*
+ tcp_update_rcv_mss (tc);
+ TCP_IW_N_SEGMENTS * tc->mss;
+ */
+ return TCP_MIN_RX_FIFO_SIZE;
+}
+
+/**
+ * Compute initial window and scale factor. As per RFC1323, window field in
+ * SYN and SYN-ACK segments is never scaled.
+ */
+u32
+tcp_initial_window_to_advertise (tcp_connection_t * tc)
+{
+ u32 max_fifo;
+
+ /* Initial wnd for SYN. Fifos are not allocated yet.
+ * Use some predefined value. For SYN-ACK we still want the
+ * scale to be computed in the same way */
+ max_fifo = TCP_MAX_RX_FIFO_SIZE;
+
+ tc->rcv_wscale = tcp_window_compute_scale (max_fifo);
+ tc->rcv_wnd = tcp_initial_wnd_unscaled (tc);
+
+ return clib_min (tc->rcv_wnd, TCP_WND_MAX);
+}
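+
+/* Note that this is used on the SYN/SYN-ACK path, where RFC1323 forbids
+ * scaling the window field; rcv_wscale is still computed and stored here
+ * so that post-handshake windows can be scaled consistently. */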
+
+/**
+ * Compute and return window to advertise, scaled as per RFC1323
+ */
+u32
+tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state)
+{
+ if (state < TCP_STATE_ESTABLISHED)
+ return tcp_initial_window_to_advertise (tc);
+
+ tcp_update_rcv_wnd (tc);
+
+ if (tc->rcv_wnd == 0)
+ {
+ tc->flags |= TCP_CONN_SENT_RCV_WND0;
+ }
+ else
+ {
+ tc->flags &= ~TCP_CONN_SENT_RCV_WND0;
+ }
+
+ return tc->rcv_wnd >> tc->rcv_wscale;
+}
+
+void
+tcp_update_rcv_wnd (tcp_connection_t * tc)
+{
+ i32 observed_wnd;
+ u32 available_space, max_fifo, wnd;
+
+ /*
+ * Figure out how much space we have available
+ */
+ available_space = stream_session_max_rx_enqueue (&tc->connection);
+ max_fifo = stream_session_rx_fifo_size (&tc->connection);
+
+ ASSERT (tc->rcv_opts.mss < max_fifo);
+ if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3)
+ available_space = 0;
+
+ /*
+ * Use the above and what we know about what we've previously advertised
+ * to compute the new window
+ */
+ observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
+ if (observed_wnd < 0)
+ observed_wnd = 0;
+
+ /* Bad. Thou shalt not shrink */
+ if (available_space < observed_wnd)
+ {
+ wnd = observed_wnd;
+ TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
+ }
+ else
+ {
+ wnd = available_space;
+ }
+
+  /* Make sure we advertise a multiple of 1 << rcv_wscale, so no
+   * precision is lost when the peer shifts the window back up */
+  if (wnd && tc->rcv_wscale)
+    {
+      wnd &= ~((1 << tc->rcv_wscale) - 1);
+ if (wnd == 0)
+ wnd = 1 << tc->rcv_wscale;
+ }
+
+ tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
+}
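+
+/* Worked example of the no-shrink rule above: if we last advertised
+ * rcv_wnd = 8192 and 3000 bytes arrived since the last ACK sent
+ * (rcv_nxt - rcv_las = 3000), the peer may still legitimately send
+ * 5192 bytes, so even with no fifo space we keep advertising 5192. */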
+
+/**
+ * Write TCP options to segment.
+ */
+u32
+tcp_options_write (u8 * data, tcp_options_t * opts)
+{
+ u32 opts_len = 0;
+ u32 buf, seq_len = 4;
+
+ if (tcp_opts_mss (opts))
+ {
+ *data++ = TCP_OPTION_MSS;
+ *data++ = TCP_OPTION_LEN_MSS;
+ buf = clib_host_to_net_u16 (opts->mss);
+ clib_memcpy (data, &buf, sizeof (opts->mss));
+ data += sizeof (opts->mss);
+ opts_len += TCP_OPTION_LEN_MSS;
+ }
+
+ if (tcp_opts_wscale (opts))
+ {
+ *data++ = TCP_OPTION_WINDOW_SCALE;
+ *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
+ *data++ = opts->wscale;
+ opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
+ }
+
+ if (tcp_opts_sack_permitted (opts))
+ {
+ *data++ = TCP_OPTION_SACK_PERMITTED;
+ *data++ = TCP_OPTION_LEN_SACK_PERMITTED;
+ opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
+ }
+
+ if (tcp_opts_tstamp (opts))
+ {
+ *data++ = TCP_OPTION_TIMESTAMP;
+ *data++ = TCP_OPTION_LEN_TIMESTAMP;
+ buf = clib_host_to_net_u32 (opts->tsval);
+ clib_memcpy (data, &buf, sizeof (opts->tsval));
+ data += sizeof (opts->tsval);
+ buf = clib_host_to_net_u32 (opts->tsecr);
+ clib_memcpy (data, &buf, sizeof (opts->tsecr));
+ data += sizeof (opts->tsecr);
+ opts_len += TCP_OPTION_LEN_TIMESTAMP;
+ }
+
+ if (tcp_opts_sack (opts))
+ {
+ int i;
+ u32 n_sack_blocks = clib_min (vec_len (opts->sacks),
+ TCP_OPTS_MAX_SACK_BLOCKS);
+
+ if (n_sack_blocks != 0)
+ {
+ *data++ = TCP_OPTION_SACK_BLOCK;
+ *data++ = 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
+ for (i = 0; i < n_sack_blocks; i++)
+ {
+ buf = clib_host_to_net_u32 (opts->sacks[i].start);
+ clib_memcpy (data, &buf, seq_len);
+ data += seq_len;
+ buf = clib_host_to_net_u32 (opts->sacks[i].end);
+ clib_memcpy (data, &buf, seq_len);
+ data += seq_len;
+ }
+ opts_len += 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
+ }
+ }
+
+ /* Terminate TCP options */
+ if (opts_len % 4)
+ {
+ *data++ = TCP_OPTION_EOL;
+ opts_len += TCP_OPTION_LEN_EOL;
+ }
+
+ /* Pad with zeroes to a u32 boundary */
+ while (opts_len % 4)
+ {
+ *data++ = TCP_OPTION_NOOP;
+ opts_len += TCP_OPTION_LEN_NOOP;
+ }
+ return opts_len;
+}
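+
+/* Layout sketch for an established segment carrying only a timestamp
+ * option: kind 8, length 10, tsval, tsecr (10 bytes), then one EOL byte
+ * and one NOOP byte of padding for a 12-byte, u32-aligned options area. */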
+
+always_inline int
+tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale)
+{
+ u8 len = 0;
+
+ opts->flags |= TCP_OPTS_FLAG_MSS;
+ opts->mss = dummy_mtu; /*XXX discover that */
+ len += TCP_OPTION_LEN_MSS;
+
+ opts->flags |= TCP_OPTS_FLAG_WSCALE;
+ opts->wscale = wnd_scale;
+ len += TCP_OPTION_LEN_WINDOW_SCALE;
+
+ opts->flags |= TCP_OPTS_FLAG_TSTAMP;
+ opts->tsval = tcp_time_now ();
+ opts->tsecr = 0;
+ len += TCP_OPTION_LEN_TIMESTAMP;
+
+ if (TCP_USE_SACKS)
+ {
+ opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
+ len += TCP_OPTION_LEN_SACK_PERMITTED;
+ }
+
+ /* Align to needed boundary */
+ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
+ return len;
+}
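+
+/* The alignment step above rounds len up to the next multiple of
+ * TCP_OPTS_ALIGN without branching. E.g., with TCP_OPTS_ALIGN = 4 and
+ * len = 19 (MSS 4 + wscale 3 + timestamp 10 + sack-permitted 2),
+ * (4 - 19 % 4) % 4 = 1, so 20 bytes are reserved for options. */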
+
+always_inline int
+tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts)
+{
+ u8 len = 0;
+
+ opts->flags |= TCP_OPTS_FLAG_MSS;
+ opts->mss = tc->mss;
+ len += TCP_OPTION_LEN_MSS;
+
+ if (tcp_opts_wscale (&tc->rcv_opts))
+ {
+ opts->flags |= TCP_OPTS_FLAG_WSCALE;
+ opts->wscale = tc->rcv_wscale;
+ len += TCP_OPTION_LEN_WINDOW_SCALE;
+ }
+
+ if (tcp_opts_tstamp (&tc->rcv_opts))
+ {
+ opts->flags |= TCP_OPTS_FLAG_TSTAMP;
+ opts->tsval = tcp_time_now ();
+ opts->tsecr = tc->tsval_recent;
+ len += TCP_OPTION_LEN_TIMESTAMP;
+ }
+
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
+ len += TCP_OPTION_LEN_SACK_PERMITTED;
+ }
+
+ /* Align to needed boundary */
+ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
+ return len;
+}
+
+always_inline int
+tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts)
+{
+ u8 len = 0;
+
+ opts->flags = 0;
+
+ if (tcp_opts_tstamp (&tc->rcv_opts))
+ {
+ opts->flags |= TCP_OPTS_FLAG_TSTAMP;
+ opts->tsval = tcp_time_now ();
+ opts->tsecr = tc->tsval_recent;
+ len += TCP_OPTION_LEN_TIMESTAMP;
+ }
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ if (vec_len (tc->snd_sacks))
+ {
+ opts->flags |= TCP_OPTS_FLAG_SACK;
+ opts->sacks = tc->snd_sacks;
+ opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks),
+ TCP_OPTS_MAX_SACK_BLOCKS);
+ len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks;
+ }
+ }
+
+ /* Align to needed boundary */
+ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
+ return len;
+}
+
+always_inline int
+tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
+ tcp_state_t state)
+{
+ switch (state)
+ {
+ case TCP_STATE_ESTABLISHED:
+ case TCP_STATE_FIN_WAIT_1:
+ return tcp_make_established_options (tc, opts);
+ case TCP_STATE_SYN_RCVD:
+ return tcp_make_synack_options (tc, opts);
+ case TCP_STATE_SYN_SENT:
+ return tcp_make_syn_options (opts, tc->rcv_wscale);
+ default:
+ clib_warning ("Not handled!");
+ return 0;
+ }
+}
+
+/**
+ * Update snd_mss to reflect the effective segment size that we can send
+ * by taking into account all TCP options, including SACKs
+ */
+void
+tcp_update_snd_mss (tcp_connection_t * tc)
+{
+ /* Compute options to be used for connection. These may be reused when
+ * sending data or to compute the effective mss (snd_mss) */
+ tc->snd_opts_len =
+ tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED);
+
+ /* XXX check if MTU has been updated */
+ tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len;
+ ASSERT (tc->snd_mss > 0);
+}
+
+void
+tcp_init_mss (tcp_connection_t * tc)
+{
+ u16 default_min_mss = 536;
+ tcp_update_rcv_mss (tc);
+
+ /* TODO cache mss and consider PMTU discovery */
+ tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss);
+
+ if (tc->snd_mss < 45)
+ {
+      clib_warning ("snd mss too small");
+ /* Assume that at least the min default mss works */
+ tc->snd_mss = default_min_mss;
+ tc->rcv_opts.mss = default_min_mss;
+ }
+
+ /* We should have enough space for 40 bytes of options */
+ ASSERT (tc->snd_mss > 45);
+
+ /* If we use timestamp option, account for it */
+ if (tcp_opts_tstamp (&tc->rcv_opts))
+ tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP;
+}
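+
+/* E.g., with rcv_opts.mss = 1460 and timestamps in use, snd_mss ends up
+ * 1460 - 10 = 1450 usable payload bytes per segment. */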
+
+always_inline int
+tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 current_length = vec_len (tm->tx_buffers[thread_index]);
+ u32 n_allocated;
+
+ vec_validate (tm->tx_buffers[thread_index],
+ current_length + n_free_buffers - 1);
+ n_allocated =
+ vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length],
+ n_free_buffers);
+ _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated;
+ /* buffer shortage, report failure */
+ if (vec_len (tm->tx_buffers[thread_index]) == 0)
+ {
+ clib_warning ("out of buffers");
+ return -1;
+ }
+ return 0;
+}
+
+always_inline int
+tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
+{
+ u32 *my_tx_buffers;
+ u32 thread_index = vlib_get_thread_index ();
+ if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0))
+ {
+ if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE))
+ return -1;
+ }
+ my_tx_buffers = tm->tx_buffers[thread_index];
+ *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1];
+ _vec_len (my_tx_buffers) -= 1;
+ return 0;
+}
+
+always_inline void
+tcp_return_buffer (tcp_main_t * tm)
+{
+ _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1;
+}
+
+always_inline void *
+tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
+{
+ if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ vlib_buffer_free_one (vm, b->next_buffer);
+ /* Zero all flags but free list index and trace flag */
+ b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
+ b->current_data = 0;
+ b->current_length = 0;
+ b->total_length_not_including_first_buffer = 0;
+ vnet_buffer (b)->tcp.flags = 0;
+
+ /* Leave enough space for headers */
+ return vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
+}
+
+always_inline void *
+tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
+{
+ ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ b->total_length_not_including_first_buffer = 0;
+ vnet_buffer (b)->tcp.flags = 0;
+
+ /* Leave enough space for headers */
+ return vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
+}
+
+/**
+ * Prepare ACK
+ */
+void
+tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state,
+ u8 flags)
+{
+ tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
+ u8 tcp_opts_len, tcp_hdr_opts_len;
+ tcp_header_t *th;
+ u16 wnd;
+
+ wnd = tcp_window_to_advertise (tc, state);
+
+ /* Make and write options */
+ tcp_opts_len = tcp_make_established_options (tc, snd_opts);
+ tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
+ tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd);
+
+ tcp_options_write ((u8 *) (th + 1), snd_opts);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+}
+
+/**
+ * Convert buffer to ACK
+ */
+void
+tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ vlib_main_t *vm = vlib_get_main ();
+
+ tcp_reuse_buffer (vm, b);
+ tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
+ TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc);
+ vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
+ tc->rcv_las = tc->rcv_nxt;
+}
+
+/**
+ * Convert buffer to FIN-ACK
+ */
+void
+tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u8 flags = 0;
+
+ tcp_reuse_buffer (vm, b);
+
+ flags = TCP_FLAG_FIN | TCP_FLAG_ACK;
+ tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, flags);
+
+ /* Reset flags, make sure ack is sent */
+ vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
+}
+
+/**
+ * Convert buffer to SYN
+ */
+void
+tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ u8 tcp_hdr_opts_len, tcp_opts_len;
+ tcp_header_t *th;
+ u16 initial_wnd;
+ tcp_options_t snd_opts;
+
+ initial_wnd = tcp_initial_window_to_advertise (tc);
+
+ /* Make and write options */
+ memset (&snd_opts, 0, sizeof (snd_opts));
+ tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
+ tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
+ tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
+ initial_wnd);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ tcp_options_write ((u8 *) (th + 1), &snd_opts);
+
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
+ tc->rto * TCP_TO_TIMER_TICK);
+}
+
+/**
+ * Convert buffer to SYN-ACK
+ */
+void
+tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
+ u8 tcp_opts_len, tcp_hdr_opts_len;
+ tcp_header_t *th;
+ u16 initial_wnd;
+
+ memset (snd_opts, 0, sizeof (*snd_opts));
+ tcp_reuse_buffer (vm, b);
+
+ initial_wnd = tcp_initial_window_to_advertise (tc);
+ tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
+ tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
+ tc->rcv_nxt, tcp_hdr_opts_len,
+ TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
+ tcp_options_write ((u8 *) (th + 1), snd_opts);
+
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
+
+  /* Init retransmit timer. Use force-update instead of set because the
+   * timer may already be running for a retransmitted SYN-ACK */
+ tcp_retransmit_timer_force_update (tc);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u8 flush)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 thread_index = vlib_get_thread_index ();
+ u32 *to_next, next_index;
+ vlib_frame_t *f;
+
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ b->error = 0;
+
+ /* Default FIB for now */
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = 0;
+
+ /* Send to IP lookup */
+ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
+ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+ {
+ b->pre_data[0] = 2;
+ b->pre_data[1] = next_index;
+ }
+
+ f = tm->ip_lookup_tx_frames[!is_ip4][thread_index];
+ if (!f)
+ {
+ f = vlib_get_frame_to_node (vm, next_index);
+ ASSERT (f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f;
+ }
+
+ to_next = vlib_frame_vector_args (f);
+ to_next[f->n_vectors] = bi;
+ f->n_vectors += 1;
+ if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, next_index, f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1);
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0);
+}
+
+always_inline void
+tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u8 flush)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 thread_index = vlib_get_thread_index ();
+ u32 *to_next, next_index;
+ vlib_frame_t *f;
+
+ b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ b->error = 0;
+
+ /* Decide where to send the packet */
+ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
+ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+ {
+ b->pre_data[0] = 1;
+ b->pre_data[1] = next_index;
+ }
+
+ /* Get frame to v4/6 output node */
+ f = tm->tx_frames[!is_ip4][thread_index];
+ if (!f)
+ {
+ f = vlib_get_frame_to_node (vm, next_index);
+ ASSERT (f);
+ tm->tx_frames[!is_ip4][thread_index] = f;
+ }
+ to_next = vlib_frame_vector_args (f);
+ to_next[f->n_vectors] = bi;
+ f->n_vectors += 1;
+ if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, next_index, f);
+ tm->tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+always_inline void
+tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4)
+{
+ tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0);
+}
+
+always_inline void
+tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1);
+}
+
+int
+tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0,
+ tcp_state_t state, u8 thread_index, u8 is_ip4)
+{
+ ip4_header_t *ih4;
+ ip6_header_t *ih6;
+ tcp_header_t *th0;
+ ip4_address_t src_ip40, dst_ip40;
+ ip6_address_t src_ip60, dst_ip60;
+ u16 src_port, dst_port;
+ u32 tmp;
+ u32 seq, ack;
+ u8 flags;
+
+ /* Find IP and TCP headers */
+ th0 = tcp_buffer_hdr (b0);
+
+ /* Save src and dst ip */
+ if (is_ip4)
+ {
+ ih4 = vlib_buffer_get_current (b0);
+ ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40);
+ src_ip40.as_u32 = ih4->src_address.as_u32;
+ dst_ip40.as_u32 = ih4->dst_address.as_u32;
+ }
+ else
+ {
+ ih6 = vlib_buffer_get_current (b0);
+ ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60);
+ clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t));
+ clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t));
+ }
+
+ src_port = th0->src_port;
+ dst_port = th0->dst_port;
+
+ /* Try to determine what/why we're actually resetting */
+ if (state == TCP_STATE_CLOSED)
+ {
+ if (!tcp_syn (th0))
+ return -1;
+
+ tmp = clib_net_to_host_u32 (th0->seq_number);
+
+ /* Got a SYN for no listener. */
+ flags = TCP_FLAG_RST | TCP_FLAG_ACK;
+ ack = clib_host_to_net_u32 (tmp + 1);
+ seq = 0;
+ }
+ else
+ {
+ flags = TCP_FLAG_RST;
+ seq = th0->ack_number;
+ ack = 0;
+ }
+
+ tcp_reuse_buffer (vm, b0);
+ th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack,
+ sizeof (tcp_header_t), flags, 0);
+
+ if (is_ip4)
+ {
+ ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40,
+ IP_PROTOCOL_TCP, 1);
+ th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4);
+ }
+ else
+ {
+ int bogus = ~0;
+ ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60,
+ IP_PROTOCOL_TCP);
+ th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus);
+ ASSERT (!bogus);
+ }
+
+ return 0;
+}
+
+/**
+ * Send reset without reusing the existing buffer
+ *
+ * Connection info is extracted from the original packet.
+ */
+void
+tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4)
+{
+ vlib_buffer_t *b;
+ u32 bi;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ u8 tcp_hdr_len, flags = 0;
+ tcp_header_t *th, *pkt_th;
+ u32 seq, ack;
+ ip4_header_t *ih4, *pkt_ih4;
+ ip6_header_t *ih6, *pkt_ih6;
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
+ b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
+
+ /* Make and write options */
+ tcp_hdr_len = sizeof (tcp_header_t);
+
+ if (is_ip4)
+ {
+ pkt_ih4 = vlib_buffer_get_current (pkt);
+ pkt_th = ip4_next_header (pkt_ih4);
+ }
+ else
+ {
+ pkt_ih6 = vlib_buffer_get_current (pkt);
+ pkt_th = ip6_next_header (pkt_ih6);
+ }
+
+ if (tcp_ack (pkt_th))
+ {
+ flags = TCP_FLAG_RST;
+ seq = pkt_th->ack_number;
+ ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0;
+ }
+ else
+ {
+ flags = TCP_FLAG_RST | TCP_FLAG_ACK;
+ seq = 0;
+ ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end);
+ }
+
+ th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port,
+ seq, ack, tcp_hdr_len, flags, 0);
+
+ /* Swap src and dst ip */
+ if (is_ip4)
+ {
+ ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40);
+ ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address,
+ &pkt_ih4->src_address, IP_PROTOCOL_TCP, 1);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
+ }
+ else
+ {
+ int bogus = ~0;
+ ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) ==
+ 0x60);
+ ih6 = vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address,
+ &pkt_ih6->src_address, IP_PROTOCOL_TCP);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
+ ASSERT (!bogus);
+ }
+
+ tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
+}
+
+/**
+ * Build and set reset packet for connection
+ */
+void
+tcp_send_reset (tcp_connection_t * tc)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_buffer_t *b;
+ u32 bi;
+ tcp_header_t *th;
+ u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len;
+ u8 flags;
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
+
+ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+ tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
+ advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED);
+ flags = TCP_FLAG_RST;
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
+ tc->rcv_nxt, tcp_hdr_opts_len, flags,
+ advertise_wnd);
+ opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
+ ASSERT (opts_write_len == tc->snd_opts_len);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+ if (tc->c_is_ip4)
+ {
+ ip4_header_t *ih4;
+ ih4 = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip.ip4,
+ &tc->c_rmt_ip.ip4, IP_PROTOCOL_TCP, 0);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
+ }
+ else
+ {
+ int bogus = ~0;
+ ip6_header_t *ih6;
+ ih6 = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip.ip6,
+ &tc->c_rmt_ip.ip6, IP_PROTOCOL_TCP);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
+ ASSERT (!bogus);
+ }
+ tcp_enqueue_to_ip_lookup_now (vm, b, bi, tc->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
+}
+
+void
+tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b)
+{
+ tcp_header_t *th = vlib_buffer_get_current (b);
+ vlib_main_t *vm = vlib_get_main ();
+ if (tc->c_is_ip4)
+ {
+ ip4_header_t *ih;
+ ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4,
+ &tc->c_rmt_ip4, IP_PROTOCOL_TCP, 1);
+ th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih);
+ }
+ else
+ {
+ ip6_header_t *ih;
+ int bogus = ~0;
+
+ ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6,
+ &tc->c_rmt_ip6, IP_PROTOCOL_TCP);
+ th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus);
+ ASSERT (!bogus);
+ }
+}
+
+/**
+ * Send SYN
+ *
+ * Builds a SYN packet for a half-open connection and sends it to ipx_lookup.
+ * The packet is not forwarded through tcpx_output to avoid doing lookups
+ * in the half_open pool.
+ */
+void
+tcp_send_syn (tcp_connection_t * tc)
+{
+ vlib_buffer_t *b;
+ u32 bi;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
+ b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
+ tcp_make_syn (tc, b);
+
+ /* Measure RTT with this */
+ tc->rtt_ts = tcp_time_now ();
+ tc->rtt_seq = tc->snd_nxt;
+ tc->rto_boff = 0;
+
+ /* Set the connection establishment timer */
+ tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME);
+
+ tcp_push_ip_hdr (tm, tc, b);
+ tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc);
+}
+
+/**
+ * Flush tx frame populated by retransmits and timer pops
+ */
+void
+tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+ if (tcp_main.tx_frames[!is_ip4][thread_index])
+ {
+ u32 next_index;
+ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
+ vlib_put_frame_to_node (vm, next_index,
+ tcp_main.tx_frames[!is_ip4][thread_index]);
+ tcp_main.tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+/**
+ * Flush ip lookup tx frames populated by timer pops
+ */
+always_inline void
+tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+ if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index])
+ {
+ u32 next_index;
+ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
+ vlib_put_frame_to_node (vm, next_index,
+ tcp_main.ip_lookup_tx_frames[!is_ip4]
+ [thread_index]);
+ tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+/**
+ * Flush v4 and v6 tcp and ip-lookup tx frames for thread index
+ */
+void
+tcp_flush_frames_to_output (u8 thread_index)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ tcp_flush_frame_to_output (vm, thread_index, 1);
+ tcp_flush_frame_to_output (vm, thread_index, 0);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 1);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 0);
+}
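+
+/*
+ * Usage sketch (illustrative only): because tcp_enqueue_to_output and
+ * tcp_enqueue_to_ip_lookup batch buffers into per-thread frames, code that
+ * enqueues outside a dispatch context (retransmits, timer pops) must make
+ * sure the frames are eventually flushed, roughly:
+ *
+ *   u32 thread_index = vlib_get_thread_index ();
+ *   // ... run expired timer handlers, which enqueue to the tx frames ...
+ *   tcp_flush_frames_to_output (thread_index);
+ */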
+
+/**
+ * Send FIN
+ */
+void
+tcp_send_fin (tcp_connection_t * tc)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_t *b;
+ u32 bi;
+ u8 fin_snt = 0;
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
+ fin_snt = tc->flags & TCP_CONN_FINSNT;
+ if (fin_snt)
+ tc->snd_nxt = tc->snd_una;
+ tcp_make_fin (tc, b);
+ tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
+ if (!fin_snt)
+ {
+ tc->flags |= TCP_CONN_FINSNT;
+ tc->flags &= ~TCP_CONN_FINPNDG;
+ /* Account for the FIN */
+ tc->snd_una_max += 1;
+ tc->snd_nxt = tc->snd_una_max;
+ }
+ tcp_retransmit_timer_force_update (tc);
+ TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
+}
+
+always_inline u8
+tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state)
+{
+ switch (next_state)
+ {
+ case TCP_STATE_ESTABLISHED:
+ return TCP_FLAG_ACK;
+ case TCP_STATE_SYN_RCVD:
+ return TCP_FLAG_SYN | TCP_FLAG_ACK;
+ case TCP_STATE_SYN_SENT:
+ return TCP_FLAG_SYN;
+ case TCP_STATE_LAST_ACK:
+ case TCP_STATE_FIN_WAIT_1:
+ if (tc->snd_nxt + 1 < tc->snd_una_max)
+ return TCP_FLAG_ACK;
+ else
+ return TCP_FLAG_FIN;
+ default:
+ clib_warning ("Shouldn't be here!");
+ }
+ return 0;
+}
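+
+/*
+ * Worked example for the FIN_WAIT_1/LAST_ACK branch above: snd_una_max
+ * accounts for the FIN (see tcp_send_fin), so with snd_una_max = 1001 and
+ * the FIN occupying sequence number 1000, a segment built at snd_nxt = 500
+ * still has data in front of the FIN (500 + 1 < 1001) and carries a plain
+ * ACK, while one built at snd_nxt = 1000 carries the FIN flag.
+ */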
+
+/**
+ * Push TCP header and update connection variables
+ */
+static void
+tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
+ tcp_state_t next_state, u8 compute_opts)
+{
+ u32 advertise_wnd, data_len;
+ u8 tcp_hdr_opts_len, opts_write_len, flags;
+ tcp_header_t *th;
+
+ data_len = b->current_length + b->total_length_not_including_first_buffer;
+ ASSERT (!b->total_length_not_including_first_buffer
+ || (b->flags & VLIB_BUFFER_NEXT_PRESENT));
+ vnet_buffer (b)->tcp.flags = 0;
+
+ if (compute_opts)
+ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+
+ tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
+ advertise_wnd = tcp_window_to_advertise (tc, next_state);
+ flags = tcp_make_state_flags (tc, next_state);
+
+ /* Push header and options */
+ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
+ tc->rcv_nxt, tcp_hdr_opts_len, flags,
+ advertise_wnd);
+ opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
+
+ ASSERT (opts_write_len == tc->snd_opts_len);
+ vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+
+ /*
+ * Update connection variables
+ */
+
+ tc->snd_nxt += data_len;
+ tc->rcv_las = tc->rcv_nxt;
+
+ /* TODO this is updated in output as well ... */
+ if (seq_gt (tc->snd_nxt, tc->snd_una_max))
+ {
+ tc->snd_una_max = tc->snd_nxt;
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+ }
+
+ TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
+}
+
+void
+tcp_send_ack (tcp_connection_t * tc)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+
+ vlib_buffer_t *b;
+ u32 bi;
+
+ /* Get buffer */
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
+
+ /* Fill in the ACK */
+ tcp_make_ack (tc, b);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+}
+
+/**
+ * Delayed ack timer handler
+ *
+ * Sends delayed ACK when timer expires
+ */
+void
+tcp_timer_delack_handler (u32 index)
+{
+ u32 thread_index = vlib_get_thread_index ();
+ tcp_connection_t *tc;
+
+ tc = tcp_connection_get (index, thread_index);
+ tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID;
+ tcp_send_ack (tc);
+}
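+
+/*
+ * Usage sketch (illustrative only): rather than calling tcp_send_ack for
+ * every segment, the receive path can arm the delayed-ack timer and let
+ * the handler above send the ACK when it pops. TCP_DELACK_TIME is assumed
+ * to be the delack interval constant from tcp.h.
+ *
+ *   if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
+ *     tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+ */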
+
+/**
+ * Build a retransmit segment
+ *
+ * @return the number of bytes in the segment or 0 if there's nothing to
+ * retransmit
+ */
+u32
+tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
+ u32 max_deq_bytes, vlib_buffer_t ** b)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ int n_bytes = 0;
+ u32 start, bi, available_bytes, seg_size;
+ u8 *data;
+
+ ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
+ ASSERT (max_deq_bytes != 0);
+
+ /*
+ * Make sure we can retransmit something
+ */
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ ASSERT (available_bytes >= offset);
+ available_bytes -= offset;
+ if (!available_bytes)
+ return 0;
+ max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
+ max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
+
+ /* Start is beyond snd_congestion */
+ start = tc->snd_una + offset;
+ if (seq_geq (start, tc->snd_congestion))
+ goto done;
+
+ /* Don't overshoot snd_congestion */
+ if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
+ {
+ max_deq_bytes = tc->snd_congestion - start;
+ if (max_deq_bytes == 0)
+ goto done;
+ }
+
+ seg_size = max_deq_bytes + MAX_HDRS_LEN;
+
+ /*
+ * Prepare options
+ */
+ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+
+ /*
+ * Allocate and fill in buffer(s)
+ */
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return 0;
+ *b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, *b);
+
+  /* Easy case, segment fits into a single buffer */
+ if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer))
+ {
+ n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
+ max_deq_bytes);
+ ASSERT (n_bytes == max_deq_bytes);
+ b[0]->current_length = n_bytes;
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ }
+  /* Segment doesn't fit into a single buffer, split it across a chain */
+ else
+ {
+ u32 chain_bi = ~0, n_bufs_per_seg;
+ u32 thread_index = vlib_get_thread_index ();
+ u16 n_peeked, len_to_deq, available_bufs;
+ vlib_buffer_t *chain_b, *prev_b;
+ int i;
+
+ n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer);
+
+ /* Make sure we have enough buffers */
+ available_bufs = vec_len (tm->tx_buffers[thread_index]);
+ if (n_bufs_per_seg > available_bufs)
+ {
+ if (tcp_alloc_tx_buffers (tm, thread_index,
+ VLIB_FRAME_SIZE - available_bufs))
+ {
+ tcp_return_buffer (tm);
+ *b = 0;
+ return 0;
+ }
+ }
+
+ n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
+ tm->bytes_per_buffer -
+ MAX_HDRS_LEN);
+ b[0]->current_length = n_bytes;
+ b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ b[0]->total_length_not_including_first_buffer = 0;
+ max_deq_bytes -= n_bytes;
+
+ chain_b = *b;
+ for (i = 1; i < n_bufs_per_seg; i++)
+ {
+ prev_b = chain_b;
+ len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer);
+ tcp_get_free_buffer_index (tm, &chain_bi);
+ ASSERT (chain_bi != (u32) ~ 0);
+ chain_b = vlib_get_buffer (vm, chain_bi);
+ chain_b->current_data = 0;
+ data = vlib_buffer_get_current (chain_b);
+ n_peeked = stream_session_peek_bytes (&tc->connection, data,
+ offset + n_bytes, len_to_deq);
+ ASSERT (n_peeked == len_to_deq);
+ n_bytes += n_peeked;
+ chain_b->current_length = n_peeked;
+ chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
+ chain_b->next_buffer = 0;
+
+ /* update previous buffer */
+ prev_b->next_buffer = chain_bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ max_deq_bytes -= n_peeked;
+ b[0]->total_length_not_including_first_buffer += n_peeked;
+ }
+
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ }
+
+ ASSERT (n_bytes > 0);
+ ASSERT (((*b)->current_data + (*b)->current_length) <=
+ tm->bytes_per_buffer);
+
+ if (tcp_in_fastrecovery (tc))
+ tc->snd_rxt_bytes += n_bytes;
+
+done:
+ TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes);
+ return n_bytes;
+}
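+
+/*
+ * Worked example for the chaining branch above, with hypothetical values
+ * bytes_per_buffer = 2048 and MAX_HDRS_LEN = 128: retransmitting
+ * max_deq_bytes = 4000 gives seg_size = 4128 and n_bufs_per_seg =
+ * ceil (4128 / 2048) = 3. The first buffer carries 2048 - 128 = 1920
+ * payload bytes after headroom, and the remaining 2080 bytes are peeked
+ * into two chained buffers (2048 + 32).
+ */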
+
+/**
+ * Reset congestion control, switch cwnd to loss window and try again.
+ */
+static void
+tcp_rtx_timeout_cc (tcp_connection_t * tc)
+{
+ tc->prev_ssthresh = tc->ssthresh;
+ tc->prev_cwnd = tc->cwnd;
+
+ /* Cleanly recover cc (also clears up fast retransmit) */
+ if (tcp_in_fastrecovery (tc))
+ tcp_cc_fastrecovery_exit (tc);
+
+ /* Start again from the beginning */
+ tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
+ tc->cwnd = tcp_loss_wnd (tc);
+ tc->snd_congestion = tc->snd_una_max;
+ tc->rtt_ts = 0;
+ tcp_recovery_on (tc);
+}
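+
+/*
+ * Worked example: with snd_mss = 1460 and 20 segments in flight
+ * (tcp_flight_size = 29200 bytes), the handler above sets ssthresh to
+ * clib_max (29200 / 2, 2 * 1460) = 14600 and collapses cwnd to the loss
+ * window before retransmission restarts from snd_una.
+ */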
+
+static void
+tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ u32 thread_index = vlib_get_thread_index ();
+ tcp_connection_t *tc;
+ vlib_buffer_t *b = 0;
+ u32 bi, n_bytes;
+
+ if (is_syn)
+ {
+ tc = tcp_half_open_connection_get (index);
+ /* Note: the connection may have transitioned to ESTABLISHED... */
+ if (PREDICT_FALSE (tc == 0))
+ return;
+ tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
+ }
+ else
+ {
+ tc = tcp_connection_get (index, thread_index);
+ /* Note: the connection may have been closed and pool_put */
+ if (PREDICT_FALSE (tc == 0))
+ return;
+ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
+ }
+
+ if (tc->state >= TCP_STATE_ESTABLISHED)
+ {
+ /* Lost FIN, retransmit and return */
+ if (tcp_is_lost_fin (tc))
+ {
+ tcp_send_fin (tc);
+ return;
+ }
+
+ /* We're not in recovery so make sure rto_boff is 0 */
+ if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+ {
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+ }
+
+ /* Increment RTO backoff (also equal to number of retries) and go back
+ * to first un-acked byte */
+ tc->rto_boff += 1;
+
+ /* First retransmit timeout */
+ if (tc->rto_boff == 1)
+ tcp_rtx_timeout_cc (tc);
+
+ tc->snd_nxt = tc->snd_una;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
+
+ /* Send one segment. Note that n_bytes may be zero due to buffer shortfall */
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+
+ /* TODO be less aggressive about this */
+ scoreboard_clear (&tc->sack_sb);
+
+ if (n_bytes == 0)
+ {
+ ASSERT (!b);
+ if (tc->snd_una == tc->snd_una_max)
+ return;
+ ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
+ clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2);
+ /* Try again eventually */
+ tcp_retransmit_timer_set (tc);
+ return;
+ }
+
+ bi = vlib_get_buffer_index (vm, b);
+
+ /* For first retransmit, record timestamp (Eifel detection RFC3522) */
+ if (tc->rto_boff == 1)
+ tc->snd_rxt_ts = tcp_time_now ();
+
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ tcp_retransmit_timer_update (tc);
+ }
+ /* Retransmit for SYN */
+ else if (tc->state == TCP_STATE_SYN_SENT)
+ {
+      /* Half-open connection actually moved to established, but we were
+       * waiting for the SYN retransmit timer to pop so cleanup could be
+       * called from the right thread. */
+ if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
+ {
+ if (tcp_half_open_connection_cleanup (tc))
+ {
+ clib_warning ("could not remove half-open connection");
+ ASSERT (0);
+ }
+ return;
+ }
+
+ /* Try without increasing RTO a number of times. If this fails,
+ * start growing RTO exponentially */
+ tc->rto_boff += 1;
+ if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
+ b = vlib_get_buffer (vm, bi);
+ tcp_init_buffer (vm, b);
+ tcp_make_syn (tc, b);
+
+ tc->rtt_ts = 0;
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0);
+
+ /* This goes straight to ipx_lookup. Retransmit timer set already */
+ tcp_push_ip_hdr (tm, tc, b);
+ tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+ }
+ /* Retransmit SYN-ACK */
+ else if (tc->state == TCP_STATE_SYN_RCVD)
+ {
+ tc->rto_boff += 1;
+ if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ tc->rtt_ts = 0;
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+
+ b = vlib_get_buffer (vm, bi);
+ tcp_make_synack (tc, b);
+ TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
+
+ /* Retransmit timer already updated, just enqueue to output */
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ }
+ else
+ {
+ ASSERT (tc->state == TCP_STATE_CLOSED);
+ TCP_DBG ("connection state: %d", tc->state);
+ return;
+ }
+}
+
+void
+tcp_timer_retransmit_handler (u32 index)
+{
+ tcp_timer_retransmit_handler_i (index, 0);
+}
+
+void
+tcp_timer_retransmit_syn_handler (u32 index)
+{
+ tcp_timer_retransmit_handler_i (index, 1);
+}
+
+/**
+ * Persist timer handler
+ *
+ * Got 0 snd_wnd from peer; probe the zero window by forcing out a segment.
+ */
+void
+tcp_timer_persist_handler (u32 index)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ u32 thread_index = vlib_get_thread_index ();
+ tcp_connection_t *tc;
+ vlib_buffer_t *b;
+ u32 bi, max_snd_bytes, available_bytes, offset;
+ int n_bytes = 0;
+ u8 *data;
+
+ tc = tcp_connection_get_if_valid (index, thread_index);
+
+ if (!tc)
+ return;
+
+ /* Make sure timer handle is set to invalid */
+ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
+
+ /* Problem already solved or worse */
+ if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED
+ || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+ return;
+
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ offset = tc->snd_una_max - tc->snd_una;
+
+ /* Reprogram persist if no new bytes available to send. We may have data
+ * next time */
+ if (!available_bytes)
+ {
+ tcp_persist_timer_set (tc);
+ return;
+ }
+
+ if (available_bytes <= offset)
+ {
+ ASSERT (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT));
+ return;
+ }
+
+ /* Increment RTO backoff */
+ tc->rto_boff += 1;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+ /*
+ * Try to force the first unsent segment (or buffer)
+ */
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
+ data = tcp_init_buffer (vm, b);
+
+ tcp_validate_txf_size (tc, offset);
+ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+ max_snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer - MAX_HDRS_LEN);
+ n_bytes = stream_session_peek_bytes (&tc->connection, data, offset,
+ max_snd_bytes);
+ b->current_length = n_bytes;
+ ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)
+ || tc->snd_nxt == tc->snd_una_max
+ || tc->rto_boff > 1));
+
+ tcp_push_hdr_i (tc, b, tc->state, 0);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+
+ /* Just sent new data, enable retransmit */
+ tcp_retransmit_timer_update (tc);
+}
+
+/**
+ * Retransmit first unacked segment
+ */
+void
+tcp_retransmit_first_unacked (tcp_connection_t * tc)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_t *b;
+ u32 bi, old_snd_nxt, n_bytes;
+
+ old_snd_nxt = tc->snd_nxt;
+ tc->snd_nxt = tc->snd_una;
+
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+ if (!n_bytes)
+ return;
+ bi = vlib_get_buffer_index (vm, b);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+
+ tc->snd_nxt = old_snd_nxt;
+}
+
+/**
+ * Do fast retransmit with SACKs
+ */
+void
+tcp_fast_retransmit_sack (tcp_connection_t * tc)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 n_written = 0, offset, max_bytes;
+ vlib_buffer_t *b = 0;
+ sack_scoreboard_hole_t *hole;
+ sack_scoreboard_t *sb;
+ u32 bi, old_snd_nxt;
+ int snd_space;
+ u8 snd_limited = 0, can_rescue = 0;
+
+ ASSERT (tcp_in_fastrecovery (tc));
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
+
+ old_snd_nxt = tc->snd_nxt;
+ sb = &tc->sack_sb;
+ snd_space = tcp_available_snd_space (tc);
+
+ hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
+ while (hole && snd_space > 0)
+ {
+ hole = scoreboard_next_rxt_hole (sb, hole,
+ tcp_fastrecovery_sent_1_smss (tc),
+ &can_rescue, &snd_limited);
+ if (!hole)
+ {
+ if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
+ || seq_gt (sb->rescue_rxt,
+ tc->snd_congestion)))
+ break;
+
+	  /* Rescue retransmission (RFC 6675): if RescueRxt is undefined or
+	   * less than snd_una then one segment of up to SMSS octets that
+	   * MUST include the highest outstanding unSACKed sequence number
+	   * SHOULD be returned, and RescueRxt set to RecoveryPoint.
+	   * HighRxt MUST NOT be updated.
+	   */
+ max_bytes = clib_min (tc->snd_mss,
+ tc->snd_congestion - tc->snd_una);
+ max_bytes = clib_min (max_bytes, snd_space);
+ offset = tc->snd_congestion - tc->snd_una - max_bytes;
+ sb->rescue_rxt = tc->snd_congestion;
+ tc->snd_nxt = tc->snd_una + offset;
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes,
+ &b);
+ ASSERT (n_written);
+ bi = vlib_get_buffer_index (vm, b);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ break;
+ }
+
+ max_bytes = clib_min (hole->end - sb->high_rxt, snd_space);
+ max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes;
+ if (max_bytes == 0)
+ break;
+ offset = sb->high_rxt - tc->snd_una;
+ tc->snd_nxt = sb->high_rxt;
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b);
+
+ /* Nothing left to retransmit */
+ if (n_written == 0)
+ break;
+
+ bi = vlib_get_buffer_index (vm, b);
+ sb->high_rxt += n_written;
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ ASSERT (n_written <= snd_space);
+ snd_space -= n_written;
+ }
+
+  /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
+ tc->snd_nxt = old_snd_nxt;
+}
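+
+/*
+ * Worked example for the rescue retransmission above: with snd_una = 1000,
+ * snd_congestion = 5000, snd_mss = 1460 and enough send space, max_bytes =
+ * 1460 and offset = 5000 - 1000 - 1460 = 2540, so the rescue segment
+ * covers sequence numbers 3540..4999, i.e. the last SMSS worth of bytes
+ * below the recovery point, including the highest unSACKed octet.
+ */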
+
+/**
+ * Fast retransmit without SACK info
+ */
+void
+tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 n_written = 0, offset = 0, bi, old_snd_nxt;
+ int snd_space;
+ vlib_buffer_t *b;
+
+ ASSERT (tcp_in_fastrecovery (tc));
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
+
+ /* Start resending from first un-acked segment */
+ old_snd_nxt = tc->snd_nxt;
+ tc->snd_nxt = tc->snd_una;
+ snd_space = tcp_available_snd_space (tc);
+
+ while (snd_space > 0)
+ {
+ offset += n_written;
+ n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
+
+ /* Nothing left to retransmit */
+ if (n_written == 0)
+ break;
+
+ bi = vlib_get_buffer_index (vm, b);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ snd_space -= n_written;
+ }
+
+ /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
+ tc->snd_nxt = old_snd_nxt;
+}
+
+/**
+ * Do fast retransmit
+ */
+void
+tcp_fast_retransmit (tcp_connection_t * tc)
+{
+ if (tcp_opts_sack_permitted (&tc->rcv_opts)
+ && scoreboard_first_hole (&tc->sack_sb))
+ tcp_fast_retransmit_sack (tc);
+ else
+ tcp_fast_retransmit_no_sack (tc);
+}
+
+always_inline u32
+tcp_session_has_ooo_data (tcp_connection_t * tc)
+{
+ stream_session_t *s =
+ stream_session_get (tc->c_s_index, tc->c_thread_index);
+ return svm_fifo_has_ooo_data (s->server_rx_fifo);
+}
+
+always_inline uword
+tcp46_output_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ next_index = node->cached_next_index;
+ tcp_set_time_now (my_thread_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_connection_t *tc0;
+ tcp_tx_trace_t *t0;
+ tcp_header_t *th0 = 0;
+ u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
+ my_thread_index);
+ if (PREDICT_FALSE (tc0 == 0 || tc0->state == TCP_STATE_CLOSED))
+ {
+ error0 = TCP_ERROR_INVALID_CONNECTION;
+ next0 = TCP_OUTPUT_NEXT_DROP;
+ goto done;
+ }
+
+ th0 = vlib_buffer_get_current (b0);
+ TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length);
+
+ if (is_ip4)
+ {
+ vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
+ IP_PROTOCOL_TCP, 1);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
+ th0->checksum = 0;
+ }
+ else
+ {
+ ip6_header_t *ih0;
+ ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6,
+ &tc0->c_rmt_ip6, IP_PROTOCOL_TCP);
+ b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+ vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data;
+ vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
+ th0->checksum = 0;
+ }
+
+ /* Filter out DUPACKs if there are no OOO segments left */
+ if (PREDICT_FALSE
+ (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK))
+ {
+ if (!tcp_session_has_ooo_data (tc0))
+ {
+ error0 = TCP_ERROR_FILTERED_DUPACKS;
+ next0 = TCP_OUTPUT_NEXT_DROP;
+ goto done;
+ }
+ }
+
+ /* Stop DELACK timer and fix flags */
+ tc0->flags &= ~(TCP_CONN_SNDACK);
+ tcp_timer_reset (tc0, TCP_TIMER_DELACK);
+
+	  /* If not retransmitting:
+	   * 1) update snd_una_max (SYN, SYNACK, FIN)
+	   * 2) if we're not already tracking an RTT sample, start tracking */
+ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt))
+ {
+ tc0->snd_una_max = tc0->snd_nxt;
+ if (tc0->rtt_ts == 0)
+ {
+ tc0->rtt_ts = tcp_time_now ();
+ tc0->rtt_seq = tc0->snd_nxt;
+ }
+ }
+
+ /* Set the retransmit timer if not set already and not
+ * doing a pure ACK */
+ if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT)
+ && tc0->snd_nxt != tc0->snd_una)
+ {
+ tcp_retransmit_timer_set (tc0);
+ tc0->rto_boff = 0;
+ }
+
+#if 0
+ /* Make sure we haven't lost route to our peer */
+ if (PREDICT_FALSE (tc0->last_fib_check
+ < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD))
+ {
+ if (PREDICT_TRUE
+ (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0)))
+ {
+ tc0->last_fib_check = tc0->snd_opts.tsval;
+ }
+ else
+ {
+ clib_warning ("lost connection to peer");
+ tcp_connection_reset (tc0);
+ goto done;
+ }
+ }
+
+ /* Use pre-computed dpo to set next node */
+ next0 = tc0->c_rmt_dpo.dpoi_next_node;
+ vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index;
+#endif
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+
+ b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ done:
+ b0->error = node->errors[error0];
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ if (th0)
+ {
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+ }
+ else
+ {
+ memset (&t0->tcp_header, 0, sizeof (t0->tcp_header));
+ }
+ clib_memcpy (&t0->tcp_connection, tc0,
+ sizeof (t0->tcp_connection));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_output (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ );
+}
+
+static uword
+tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ );
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_output_node) =
+{
+  .function = tcp4_output,
+  .name = "tcp4-output",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_OUTPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
+ foreach_tcp4_output_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_tx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_output_node) =
+{
+ .function = tcp6_output,
+ .name = "tcp6-output",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_OUTPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
+ foreach_tcp6_output_next
+#undef _
+ },
+ .format_buffer = format_tcp_header,
+ .format_trace = format_tcp_tx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output);
+
+u32
+tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
+{
+ tcp_connection_t *tc;
+
+ tc = (tcp_connection_t *) tconn;
+ tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0);
+ ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd));
+
+ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
+ {
+ tc->rtt_ts = tcp_time_now ();
+ tc->rtt_seq = tc->snd_nxt;
+ }
+ return 0;
+}
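+
+/*
+ * Usage sketch (illustrative only, assumed caller): tcp_push_header is the
+ * per-packet hook the session layer invokes through the transport vft
+ * after copying data from the tx fifo into a buffer; the buffer is then
+ * handed to tcp4-output/tcp6-output, which pushes the IP header.
+ *
+ *   tcp_push_header (&tc->connection, b); // stamp TCP hdr, advance snd_nxt
+ */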
+
+typedef enum _tcp_reset_next
+{
+ TCP_RESET_NEXT_DROP,
+ TCP_RESET_NEXT_IP_LOOKUP,
+ TCP_RESET_N_NEXT
+} tcp_reset_next_t;
+
+#define foreach_tcp4_reset_next \
+ _(DROP, "error-drop") \
+ _(IP_LOOKUP, "ip4-lookup")
+
+#define foreach_tcp6_reset_next \
+ _(DROP, "error-drop") \
+ _(IP_LOOKUP, "ip6-lookup")
+
+static uword
+tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, u8 is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_tx_trace_t *t0;
+ tcp_header_t *th0;
+ u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (tcp_make_reset_in_place (vm, b0, vnet_buffer (b0)->tcp.flags,
+ my_thread_index, is_ip4))
+ {
+ error0 = TCP_ERROR_LOOKUP_DROPS;
+ next0 = TCP_RESET_NEXT_DROP;
+ goto done;
+ }
+
+ /* Prepare to send to IP lookup */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0;
+ next0 = TCP_RESET_NEXT_IP_LOOKUP;
+
+ done:
+ b0->error = node->errors[error0];
+ b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ th0 = vlib_buffer_get_current (b0);
+ if (is_ip4)
+ th0 = ip4_next_header ((ip4_header_t *) th0);
+ else
+ th0 = ip6_next_header ((ip6_header_t *) th0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return from_frame->n_vectors;
+}
+
+static uword
+tcp4_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_send_reset_inline (vm, node, from_frame, 1);
+}
+
+static uword
+tcp6_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return tcp46_send_reset_inline (vm, node, from_frame, 0);
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp4_reset_node) = {
+ .function = tcp4_send_reset,
+ .name = "tcp4-reset",
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_RESET_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [TCP_RESET_NEXT_##s] = n,
+ foreach_tcp4_reset_next
+#undef _
+ },
+ .format_trace = format_tcp_tx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp4_reset_node, tcp4_send_reset);
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (tcp6_reset_node) = {
+ .function = tcp6_send_reset,
+ .name = "tcp6-reset",
+ .vector_size = sizeof (u32),
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+ .n_next_nodes = TCP_RESET_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [TCP_RESET_NEXT_##s] = n,
+ foreach_tcp6_reset_next
+#undef _
+ },
+ .format_trace = format_tcp_tx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_reset_node, tcp6_send_reset);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h
new file mode 100644
index 00000000..9ccfe655
--- /dev/null
+++ b/src/vnet/tcp/tcp_packet.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_tcp_packet_h
+#define included_tcp_packet_h
+
+#include <vnet/vnet.h>
+
+/* TCP flags bit 0 first. */
+#define foreach_tcp_flag \
+ _ (FIN) /**< No more data from sender. */ \
+ _ (SYN) /**< Synchronize sequence numbers. */ \
+ _ (RST) /**< Reset the connection. */ \
+ _ (PSH) /**< Push function. */ \
+ _ (ACK) /**< Ack field significant. */ \
+ _ (URG) /**< Urgent pointer field significant. */ \
+ _ (ECE) /**< ECN-echo. Receiver got CE packet */ \
+ _ (CWR) /**< Sender reduced congestion window */
+
+enum
+{
+#define _(f) TCP_FLAG_BIT_##f,
+ foreach_tcp_flag
+#undef _
+ TCP_N_FLAG_BITS,
+};
+
+enum
+{
+#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f,
+ foreach_tcp_flag
+#undef _
+};
+
+typedef struct _tcp_header
+{
+ union
+ {
+ struct
+ {
+ u16 src_port; /**< Source port. */
+ u16 dst_port; /**< Destination port. */
+ };
+ struct
+ {
+ u16 src, dst;
+ };
+ };
+
+ u32 seq_number; /**< Sequence number of the first data octet in this
+ * segment, except when SYN is present. If SYN
+                         * is present the seq number is the ISN and the
+ * first data octet is ISN+1 */
+ u32 ack_number; /**< Acknowledgement number if ACK is set. It contains
+ * the value of the next sequence number the sender
+ * of the segment is expecting to receive. */
+ u8 data_offset_and_reserved;
+ u8 flags; /**< Flags: see the macro above */
+ u16 window; /**< Number of bytes sender is willing to receive. */
+
+ u16 checksum; /**< Checksum of TCP pseudo header and data. */
+ u16 urgent_pointer; /**< Seq number of the byte after the urgent data. */
+} __attribute__ ((packed)) tcp_header_t;
+
+/* Flag tests that return 0 or !0 */
+#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4)
+#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN)
+#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN)
+#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST)
+#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH)
+#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK)
+#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG)
+#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE)
+#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR)
+
+/* Flag tests that return 0 or 1 */
+#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN)
+#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN)
+
+always_inline int
+tcp_header_bytes (tcp_header_t * t)
+{
+ return tcp_doff (t) * sizeof (u32);
+}
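+
+/*
+ * Example (illustrative): tcp_doff counts 32-bit words, so a header with a
+ * data offset of 8 is 32 bytes long and carries 12 bytes of options.
+ *
+ *   int opts_len = tcp_header_bytes (th) - sizeof (tcp_header_t);
+ *   if (tcp_syn (th) && !tcp_ack (th))
+ *     ; // pure SYN; options may include MSS/WSCALE/SACK_PERMITTED
+ */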
+
+/*
+ * TCP options.
+ */
+
+typedef enum tcp_option_type
+{
+ TCP_OPTION_EOL = 0, /**< End of options. */
+ TCP_OPTION_NOOP = 1, /**< No operation. */
+ TCP_OPTION_MSS = 2, /**< Limit MSS. */
+ TCP_OPTION_WINDOW_SCALE = 3, /**< Window scale. */
+ TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */
+ TCP_OPTION_SACK_BLOCK = 5, /**< Selective Ack block. */
+ TCP_OPTION_TIMESTAMP = 8, /**< Timestamps. */
+ TCP_OPTION_UTO = 28, /**< User timeout. */
+ TCP_OPTION_AO = 29, /**< Authentication Option. */
+} tcp_option_type_t;
+
+#define foreach_tcp_options_flag \
+ _ (MSS) /**< MSS advertised in SYN */ \
+ _ (TSTAMP) /**< Timestamp capability advertised in SYN */ \
+ _ (WSCALE) /**< Wnd scale capability advertised in SYN */ \
+ _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */ \
+ _ (SACK) /**< SACK present */
+
+enum
+{
+#define _(f) TCP_OPTS_FLAG_BIT_##f,
+ foreach_tcp_options_flag
+#undef _
+ TCP_OPTIONS_N_FLAG_BITS,
+};
+
+enum
+{
+#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f,
+ foreach_tcp_options_flag
+#undef _
+};
+
+typedef struct _sack_block
+{
+ u32 start; /**< Start sequence number */
+ u32 end; /**< End sequence number (first outside) */
+} sack_block_t;
+
+typedef struct
+{
+  u8 flags;		/**< Option flags, see above */
+
+ u16 mss; /**< Maximum segment size advertised */
+ u8 wscale; /**< Window scale advertised */
+ u32 tsval; /**< Timestamp value */
+ u32 tsecr; /**< Echoed/reflected time stamp */
+ sack_block_t *sacks; /**< SACK blocks */
+  u8 n_sack_blocks;	/**< Number of SACK blocks */
+} tcp_options_t;
+
+/* Flag tests that return 0 or !0 */
+#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS)
+#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP)
+#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE)
+#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK)
+#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED)
+
+/* TCP option lengths */
+#define TCP_OPTION_LEN_EOL 1
+#define TCP_OPTION_LEN_NOOP 1
+#define TCP_OPTION_LEN_MSS 4
+#define TCP_OPTION_LEN_WINDOW_SCALE 3
+#define TCP_OPTION_LEN_SACK_PERMITTED 2
+#define TCP_OPTION_LEN_TIMESTAMP 10
+#define TCP_OPTION_LEN_SACK_BLOCK 8
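+
+/*
+ * Sizing example: the options sent on a SYN (see tcp_make_syn_options in
+ * tcp_output.c) add up to 4 (MSS) + 3 (WINDOW_SCALE) + 2 (SACK_PERMITTED)
+ * + 10 (TIMESTAMP) = 19 bytes, padded to 20 to honor TCP_OPTS_ALIGN, for a
+ * 40-byte header that is comfortably below TCP_HDR_LEN_MAX.
+ */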
+
+#define TCP_HDR_LEN_MAX 60
+#define TCP_WND_MAX 65535U
+#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */
+#define TCP_OPTS_ALIGN 4
+#define TCP_OPTS_MAX_SACK_BLOCKS 3
+#endif /* included_tcp_packet_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_pg.c b/src/vnet/tcp/tcp_pg.c
new file mode 100644
index 00000000..3be4592c
--- /dev/null
+++ b/src/vnet/tcp/tcp_pg.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_pg: TCP packet-generator interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
+/* TCP flags bit 0 first. */
+#define foreach_tcp_flag \
+ _ (FIN) \
+ _ (SYN) \
+ _ (RST) \
+ _ (PSH) \
+ _ (ACK) \
+ _ (URG) \
+ _ (ECE) \
+ _ (CWR)
+
+static void
+tcp_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g, u32 * packets, u32 n_packets)
+{
+ vlib_main_t *vm = vlib_get_main ();
+ u32 ip_offset, tcp_offset;
+
+ tcp_offset = g->start_byte_offset;
+ ip_offset = (g - 1)->start_byte_offset;
+
+ while (n_packets >= 1)
+ {
+ vlib_buffer_t *p0;
+ ip4_header_t *ip0;
+ tcp_header_t *tcp0;
+ ip_csum_t sum0;
+ u32 tcp_len0;
+
+ p0 = vlib_get_buffer (vm, packets[0]);
+ n_packets -= 1;
+ packets += 1;
+
+ ASSERT (p0->current_data == 0);
+ ip0 = (void *) (p0->data + ip_offset);
+ tcp0 = (void *) (p0->data + tcp_offset);
+ tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]);
+
+ /* Initialize checksum with header. */
+ if (BITS (sum0) == 32)
+ {
+ sum0 = clib_mem_unaligned (&ip0->src_address, u32);
+ sum0 =
+ ip_csum_with_carry (sum0,
+ clib_mem_unaligned (&ip0->dst_address, u32));
+ }
+ else
+ sum0 = clib_mem_unaligned (&ip0->src_address, u64);
+
+ sum0 = ip_csum_with_carry
+ (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16)));
+
+ /* Invalidate possibly old checksum. */
+ tcp0->checksum = 0;
+
+ sum0 =
+ ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0);
+
+ tcp0->checksum = ~ip_csum_fold (sum0);
+ }
+}
+
+typedef struct
+{
+ pg_edit_t src, dst;
+ pg_edit_t seq_number, ack_number;
+ pg_edit_t data_offset_and_reserved;
+#define _(f) pg_edit_t f##_flag;
+ foreach_tcp_flag
+#undef _
+ pg_edit_t window;
+ pg_edit_t checksum;
+ pg_edit_t urgent_pointer;
+} pg_tcp_header_t;
+
+static inline void
+pg_tcp_header_init (pg_tcp_header_t * p)
+{
+  /* Initialize fields that are not bit fields in the TCP header. */
+#define _(f) pg_edit_init (&p->f, tcp_header_t, f);
+ _(src);
+ _(dst);
+ _(seq_number);
+ _(ack_number);
+ _(window);
+ _(checksum);
+ _(urgent_pointer);
+#undef _
+
+ /* Initialize bit fields. */
+#define _(f) \
+ pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \
+ flags, \
+ TCP_FLAG_BIT_##f, 1);
+
+ foreach_tcp_flag
+#undef _
+ pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t,
+ data_offset_and_reserved, 4, 4);
+}
+
+uword
+unformat_pg_tcp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t *s = va_arg (*args, pg_stream_t *);
+ pg_tcp_header_t *p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t),
+ &group_index);
+ pg_tcp_header_init (p);
+
+ /* Defaults. */
+ pg_edit_set_fixed (&p->seq_number, 0);
+ pg_edit_set_fixed (&p->ack_number, 0);
+
+ pg_edit_set_fixed (&p->data_offset_and_reserved,
+ sizeof (tcp_header_t) / sizeof (u32));
+
+ pg_edit_set_fixed (&p->window, 4096);
+ pg_edit_set_fixed (&p->urgent_pointer, 0);
+
+#define _(f) pg_edit_set_fixed (&p->f##_flag, 0);
+ foreach_tcp_flag
+#undef _
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+
+ if (!unformat (input, "TCP: %U -> %U",
+ unformat_pg_edit,
+ unformat_tcp_udp_port, &p->src,
+ unformat_pg_edit, unformat_tcp_udp_port, &p->dst))
+ goto error;
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "window %U",
+ unformat_pg_edit, unformat_pg_number, &p->window))
+ ;
+
+ else if (unformat (input, "checksum %U",
+ unformat_pg_edit, unformat_pg_number, &p->checksum))
+ ;
+
+ else if (unformat (input, "seqnum %U", unformat_pg_edit,
+ unformat_pg_number, &p->seq_number))
+ ;
+ else if (unformat (input, "acknum %U", unformat_pg_edit,
+ unformat_pg_number, &p->ack_number))
+ ;
+ /* Flags. */
+#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1);
+ foreach_tcp_flag
+#undef _
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ {
+ ip_main_t *im = &ip_main;
+ u16 dst_port;
+ tcp_udp_port_info_t *pi;
+
+ pi = 0;
+ if (p->dst.type == PG_EDIT_FIXED)
+ {
+ dst_port = pg_edit_get_value (&p->dst, PG_EDIT_LO);
+ pi = ip_get_tcp_udp_port_info (im, dst_port);
+ }
+
+ if (pi && pi->unformat_pg_edit
+ && unformat_user (input, pi->unformat_pg_edit, s))
+ ;
+
+ else if (!unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t *g = pg_stream_get_group (s, group_index);
+ g->edit_function = tcp_pg_edit_function;
+ g->edit_function_opaque = 0;
+ }
+
+ return 1;
+ }
+
+error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
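+
+/*
+ * Example (illustrative) packet-generator stream exercising the grammar
+ * parsed above. The stanza wrapper and the first "TCP: a -> b" line (the
+ * IP4 header with the protocol inferred from the name) follow the usual
+ * pg CLI conventions and are an assumption, not part of this file:
+ *
+ *   packet-generator new {
+ *     name tcp-syn
+ *     limit 1
+ *     size 64-64
+ *     node ip4-input
+ *     data { TCP: 1.2.3.4 -> 5.6.7.8
+ *            TCP: 1234 -> 80 SYN seqnum 42 window 8192 }
+ *   }
+ */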
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c
new file mode 100644
index 00000000..9b2a8ac7
--- /dev/null
+++ b/src/vnet/tcp/tcp_syn_filter4.c
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/feature/feature.h>
+#include <vnet/ip/ip.h>
+#include <vppinfra/xxhash.h>
+
+typedef struct
+{
+ f64 next_reset;
+ f64 reset_interval;
+ u8 *syn_counts;
+} syn_filter4_runtime_t;
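+
+/*
+ * Throttle sketch: each source address hashes into one byte of syn_counts,
+ * and the whole array is cleared every reset_interval seconds. A bucket
+ * that reaches 0x80 (128) SYNs within one interval drops further SYNs
+ * until the next reset, so with a hypothetical 1 second interval a single
+ * source (hash collisions aside) is limited to 128 SYNs per second.
+ */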
+
+typedef struct
+{
+ u32 next_index;
+ int not_a_syn;
+ u8 filter_value;
+} syn_filter4_trace_t;
+
+/* packet trace format function */
+static u8 *
+format_syn_filter4_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ syn_filter4_trace_t *t = va_arg (*args, syn_filter4_trace_t *);
+
+ s = format (s, "SYN_FILTER4: next index %d, %s",
+ t->next_index, t->not_a_syn ? "not a syn" : "syn");
+ if (t->not_a_syn == 0)
+ s = format (s, ", filter value %d\n", t->filter_value);
+ else
+ s = format (s, "\n");
+ return s;
+}
+
+static vlib_node_registration_t syn_filter4_node;
+
+#define foreach_syn_filter_error \
+_(THROTTLED, "TCP SYN packet throttle drops") \
+_(OK, "TCP SYN packets passed")
+
+typedef enum
+{
+#define _(sym,str) SYN_FILTER_ERROR_##sym,
+ foreach_syn_filter_error
+#undef _
+ SYN_FILTER_N_ERROR,
+} syn_filter_error_t;
+
+static char *syn_filter4_error_strings[] = {
+#define _(sym,string) string,
+ foreach_syn_filter_error
+#undef _
+};
+
+typedef enum
+{
+ SYN_FILTER_NEXT_DROP,
+ SYN_FILTER_N_NEXT,
+} syn_filter_next_t;
+
+extern vnet_feature_arc_registration_t vnet_feat_arc_ip4_local;
+
+static uword
+syn_filter4_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ u32 n_left_from, *from, *to_next;
+ syn_filter_next_t next_index;
+ u32 ok_syn_packets = 0;
+ vnet_feature_main_t *fm = &feature_main;
+ u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
+ vnet_feature_config_main_t *cm = &fm->feature_config_mains[arc_index];
+ syn_filter4_runtime_t *rt = (syn_filter4_runtime_t *) node->runtime_data;
+ f64 now = vlib_time_now (vm);
+ /* Shut up spurious gcc warnings. */
+ u8 *c0 = 0, *c1 = 0, *c2 = 0, *c3 = 0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (now > rt->next_reset)
+ {
+ memset (rt->syn_counts, 0, vec_len (rt->syn_counts));
+ rt->next_reset = now + rt->reset_interval;
+ }
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 8 && n_left_to_next >= 4)
+ {
+ u32 bi0, bi1, bi2, bi3;
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ u32 next0, next1, next2, next3;
+ ip4_header_t *ip0, *ip1, *ip2, *ip3;
+ tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3;
+ u32 not_a_syn0 = 1, not_a_syn1 = 1, not_a_syn2 = 1, not_a_syn3 = 1;
+ u64 hash0, hash1, hash2, hash3;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t *p4, *p5, *p6, *p7;
+
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+ p6 = vlib_get_buffer (vm, from[6]);
+ p7 = vlib_get_buffer (vm, from[7]);
+
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+ vlib_prefetch_buffer_header (p6, LOAD);
+ vlib_prefetch_buffer_header (p7, LOAD);
+
+ CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+	  /* speculatively enqueue b0 through b3 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ to_next[2] = bi2 = from[2];
+ to_next[3] = bi3 = from[3];
+ from += 4;
+ to_next += 4;
+ n_left_from -= 4;
+ n_left_to_next -= 4;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+ b2 = vlib_get_buffer (vm, bi2);
+ b3 = vlib_get_buffer (vm, bi3);
+
+ vnet_get_config_data
+ (&cm->config_main, &b0->current_config_index,
+ &next0, 0 /* sizeof (c0[0]) */ );
+ vnet_get_config_data
+ (&cm->config_main, &b1->current_config_index,
+ &next1, 0 /* sizeof (c0[0]) */ );
+ vnet_get_config_data
+ (&cm->config_main, &b2->current_config_index,
+ &next2, 0 /* sizeof (c0[0]) */ );
+ vnet_get_config_data
+ (&cm->config_main, &b3->current_config_index,
+ &next3, 0 /* sizeof (c0[0]) */ );
+
+ /* Not TCP? */
+ ip0 = vlib_buffer_get_current (b0);
+ if (ip0->protocol != IP_PROTOCOL_TCP)
+ goto trace00;
+
+ tcp0 = ip4_next_header (ip0);
+	  /*
+	   * Not a SYN?
+	   * $$$$ hack: the TCP bitfield flags seem not to compile to
+	   * correct code, so test the raw SYN bit (0x2) directly.
+	   */
+ if (PREDICT_TRUE (!(tcp0->flags & 0x2)))
+ goto trace00;
+
+ not_a_syn0 = 0;
+ hash0 = clib_xxhash ((u64) ip0->src_address.as_u32);
+ c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)];
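+	  /* Throttle: each bucket admits at most 128 SYNs per reset interval */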
+ if (PREDICT_FALSE (*c0 >= 0x80))
+ {
+ next0 = SYN_FILTER_NEXT_DROP;
+ b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED];
+ goto trace00;
+ }
+ *c0 += 1;
+ ok_syn_packets++;
+
+ trace00:
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ syn_filter4_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->not_a_syn = not_a_syn0;
+ t->next_index = next0;
+ t->filter_value = not_a_syn0 ? 0 : *c0;
+ }
+
+ /* Not TCP? */
+ ip1 = vlib_buffer_get_current (b1);
+ if (ip1->protocol != IP_PROTOCOL_TCP)
+ goto trace01;
+
+ tcp1 = ip4_next_header (ip1);
+	  /*
+	   * Not a SYN?
+	   * $$$$ hack: the TCP bitfield flags seem not to compile to
+	   * correct code, so test the raw SYN bit (0x2) directly.
+	   */
+ if (PREDICT_TRUE (!(tcp1->flags & 0x2)))
+ goto trace01;
+
+ not_a_syn1 = 0;
+ hash1 = clib_xxhash ((u64) ip1->src_address.as_u32);
+ c1 = &rt->syn_counts[hash1 & (_vec_len (rt->syn_counts) - 1)];
+ if (PREDICT_FALSE (*c1 >= 0x80))
+ {
+ next1 = SYN_FILTER_NEXT_DROP;
+ b1->error = node->errors[SYN_FILTER_ERROR_THROTTLED];
+ goto trace01;
+ }
+ *c1 += 1;
+ ok_syn_packets++;
+
+ trace01:
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b1->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ syn_filter4_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->not_a_syn = not_a_syn1;
+ t->next_index = next1;
+ t->filter_value = not_a_syn1 ? 0 : *c1;
+ }
+
+ /* Not TCP? */
+ ip2 = vlib_buffer_get_current (b2);
+ if (ip2->protocol != IP_PROTOCOL_TCP)
+ goto trace02;
+
+ tcp2 = ip4_next_header (ip2);
+	  /*
+	   * Not a SYN?
+	   * $$$$ hack: the TCP bitfield flags seem not to compile to
+	   * correct code, so test the raw SYN bit (0x2) directly.
+	   */
+ if (PREDICT_TRUE (!(tcp2->flags & 0x2)))
+ goto trace02;
+
+ not_a_syn2 = 0;
+ hash2 = clib_xxhash ((u64) ip2->src_address.as_u32);
+ c2 = &rt->syn_counts[hash2 & (_vec_len (rt->syn_counts) - 1)];
+ if (PREDICT_FALSE (*c2 >= 0x80))
+ {
+ next2 = SYN_FILTER_NEXT_DROP;
+ b2->error = node->errors[SYN_FILTER_ERROR_THROTTLED];
+ goto trace02;
+ }
+ *c2 += 1;
+ ok_syn_packets++;
+
+ trace02:
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b2->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ syn_filter4_trace_t *t =
+ vlib_add_trace (vm, node, b2, sizeof (*t));
+ t->not_a_syn = not_a_syn2;
+ t->next_index = next2;
+ t->filter_value = not_a_syn2 ? 0 : *c2;
+ }
+
+ /* Not TCP? */
+ ip3 = vlib_buffer_get_current (b3);
+ if (ip3->protocol != IP_PROTOCOL_TCP)
+ goto trace03;
+
+ tcp3 = ip4_next_header (ip3);
+	  /*
+	   * Not a SYN?
+	   * $$$$ hack: the TCP bitfield flags seem not to compile to
+	   * correct code, so test the raw SYN bit (0x2) directly.
+	   */
+ if (PREDICT_TRUE (!(tcp3->flags & 0x2)))
+ goto trace03;
+
+ not_a_syn3 = 0;
+ hash3 = clib_xxhash ((u64) ip3->src_address.as_u32);
+ c3 = &rt->syn_counts[hash3 & (_vec_len (rt->syn_counts) - 1)];
+ if (PREDICT_FALSE (*c3 >= 0x80))
+ {
+ next3 = SYN_FILTER_NEXT_DROP;
+ b3->error = node->errors[SYN_FILTER_ERROR_THROTTLED];
+ goto trace03;
+ }
+ *c3 += 1;
+ ok_syn_packets++;
+
+ trace03:
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b3->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ syn_filter4_trace_t *t =
+ vlib_add_trace (vm, node, b3, sizeof (*t));
+ t->not_a_syn = not_a_syn3;
+ t->next_index = next3;
+ t->filter_value = not_a_syn3 ? 0 : *c3;
+ }
+ vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, bi2, bi3,
+ next0, next1, next2, next3);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ u32 next0;
+ ip4_header_t *ip0;
+ tcp_header_t *tcp0;
+ u32 not_a_syn0 = 1;
+ u32 hash0;
+ u8 *c0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ vnet_get_config_data
+ (&cm->config_main, &b0->current_config_index,
+ &next0, 0 /* sizeof (c0[0]) */ );
+
+ /* Not TCP? */
+ ip0 = vlib_buffer_get_current (b0);
+ if (ip0->protocol != IP_PROTOCOL_TCP)
+ goto trace0;
+
+ tcp0 = ip4_next_header (ip0);
+	  /*
+	   * Not a SYN?
+	   * $$$$ hack: the TCP bitfield flags seem not to compile to
+	   * correct code, so test the raw SYN bit (0x2) directly.
+	   */
+ if (PREDICT_TRUE (!(tcp0->flags & 0x2)))
+ goto trace0;
+
+ not_a_syn0 = 0;
+ hash0 = clib_xxhash ((u64) ip0->src_address.as_u32);
+ c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)];
+ if (PREDICT_FALSE (*c0 >= 0x80))
+ {
+ next0 = SYN_FILTER_NEXT_DROP;
+ b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED];
+ goto trace0;
+ }
+ *c0 += 1;
+ ok_syn_packets++;
+
+ trace0:
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ syn_filter4_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->not_a_syn = not_a_syn0;
+ t->next_index = next0;
+ t->filter_value = not_a_syn0 ? 0 : *c0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, syn_filter4_node.index,
+ SYN_FILTER_ERROR_OK, ok_syn_packets);
+ return frame->n_vectors;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (syn_filter4_node, static) =
+{
+ .function = syn_filter4_node_fn,
+ .name = "syn-filter-4",
+ .vector_size = sizeof (u32),
+ .format_trace = format_syn_filter4_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .runtime_data_bytes = sizeof (syn_filter4_runtime_t),
+ .n_errors = ARRAY_LEN(syn_filter4_error_strings),
+ .error_strings = syn_filter4_error_strings,
+
+ .n_next_nodes = SYN_FILTER_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [SYN_FILTER_NEXT_DROP] = "error-drop",
+ },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (syn_filter4_node, syn_filter4_node_fn);
+
+/* *INDENT-OFF* */
+VNET_FEATURE_INIT (syn_filter_4, static) =
+{
+ .arc_name = "ip4-local",
+ .node_name = "syn-filter-4",
+ .runs_before = VNET_FEATURES("ip4-local-end-of-arc"),
+};
+/* *INDENT-ON* */
+
+int
+syn_filter_enable_disable (u32 sw_if_index, int enable_disable)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_sw_interface_t *sw;
+ int rv = 0;
+
+ /* Utterly wrong? */
+ if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index))
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ /* Not a physical port? */
+ sw = vnet_get_sw_interface (vnm, sw_if_index);
+ if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ if (enable_disable)
+ {
+ syn_filter4_runtime_t *rt;
+
+ /* *INDENT-OFF* */
+ foreach_vlib_main ({
+ rt = vlib_node_get_runtime_data (this_vlib_main, syn_filter4_node.index);
+ vec_validate (rt->syn_counts, 1023);
+	  /*
+	   * Given perfect dispersion / optimal hashing results:
+	   * allow 128K (successful) SYNs/sec: 1024 buckets, each of which
+	   * absorbs 128 SYNs (1024 * 128 = 131072) before filtering, with
+	   * the table reset once a second. Reality bites, so let's try
+	   * resetting once every 100ms instead.
+	   */
+ rt->reset_interval = 0.1; /* reset interval in seconds */
+ });
+ /* *INDENT-ON* */
+ }
+
+ rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4",
+ sw_if_index, enable_disable, 0, 0);
+
+ return rv;
+}
+
+static clib_error_t *
+syn_filter_enable_disable_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t *vnm = vnet_get_main ();
+ u32 sw_if_index = ~0;
+ int enable_disable = 1;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "disable"))
+ enable_disable = 0;
+ else if (unformat (input, "%U", unformat_vnet_sw_interface,
+ vnm, &sw_if_index))
+ ;
+ else
+ break;
+ }
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "Please specify an interface...");
+
+ rv = syn_filter_enable_disable (sw_if_index, enable_disable);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+ return clib_error_return
+ (0, "Invalid interface, only works on physical ports");
+
+ case VNET_API_ERROR_UNIMPLEMENTED:
+ return clib_error_return (0,
+ "Device driver doesn't support redirection");
+
+ case VNET_API_ERROR_INVALID_VALUE:
+ return clib_error_return (0, "feature arc not found");
+
+ case VNET_API_ERROR_INVALID_VALUE_2:
+ return clib_error_return (0, "feature node not found");
+
+ default:
+ return clib_error_return (0, "syn_filter_enable_disable returned %d",
+ rv);
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (sr_content_command, static) =
+{
+ .path = "ip syn filter",
+ .short_help = "ip syn filter <interface-name> [disable]",
+ .function = syn_filter_enable_disable_command_fn,
+};
+/* *INDENT-ON* */
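+
+/*
+ * Example CLI usage (interface name is illustrative):
+ *   vpp# ip syn filter GigabitEthernet0/8/0
+ *   vpp# ip syn filter GigabitEthernet0/8/0 disable
+ */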
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c
new file mode 100644
index 00000000..37640cc6
--- /dev/null
+++ b/src/vnet/tcp/tcp_test.c
@@ -0,0 +1,1764 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/tcp/tcp.h>
+
+#define TCP_TEST_I(_cond, _comment, _args...) \
+({ \
+ int _evald = (_cond); \
+ if (!(_evald)) { \
+ fformat(stderr, "FAIL:%d: " _comment "\n", \
+ __LINE__, ##_args); \
+ } else { \
+ fformat(stderr, "PASS:%d: " _comment "\n", \
+ __LINE__, ##_args); \
+ } \
+ _evald; \
+})
+
+#define TCP_TEST(_cond, _comment, _args...) \
+{ \
+ if (!TCP_TEST_I(_cond, _comment, ##_args)) { \
+ return 1; \
+ } \
+}
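+
+/*
+ * Both macros log PASS/FAIL for each assertion; TCP_TEST additionally
+ * makes the enclosing test function return 1 on failure. Illustrative
+ * use:
+ *
+ *   TCP_TEST ((rv == 0), "enqueue returned %d", rv);
+ */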
+
+/* *INDENT-OFF* */
+scoreboard_trace_elt_t sb_trace[] = {};
+/* *INDENT-ON* */
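+
+/*
+ * Empty placeholder: presumably populated by hand with a captured
+ * scoreboard trace (under TCP_SCOREBOARD_TRACE) before running "replay".
+ */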
+
+static int
+tcp_test_scoreboard_replay (vlib_main_t * vm, unformat_input_t * input)
+{
+ int verbose = 0;
+ tcp_connection_t _tc, *tc = &_tc;
+ u8 *s = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "detail"))
+ verbose = 1;
+ else
+ {
+ clib_error_t *e = clib_error_return
+ (0, "unknown input `%U'", format_unformat_error, input);
+ clib_error_report (e);
+ return -1;
+ }
+ }
+
+#if TCP_SCOREBOARD_TRACE
+ tc->sack_sb.trace = sb_trace;
+#endif
+ s = tcp_scoreboard_replay (s, tc, verbose);
+ vlib_cli_output (vm, "%v", s);
+ return 0;
+}
+
+static int
+tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
+{
+ tcp_connection_t _tc, *tc = &_tc;
+ sack_scoreboard_t *sb = &tc->sack_sb;
+ sack_block_t *sacks = 0, block;
+ sack_scoreboard_hole_t *hole;
+ int i, verbose = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else if (unformat (input, "replay"))
+ return tcp_test_scoreboard_replay (vm, input);
+      else
+	break;
+    }
+
+ memset (tc, 0, sizeof (*tc));
+
+ tc->snd_una = 0;
+ tc->snd_una_max = 1000;
+ tc->snd_nxt = 1000;
+ tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
+ scoreboard_init (&tc->sack_sb);
+
+ for (i = 0; i < 1000 / 100; i++)
+ {
+ block.start = i * 100;
+ block.end = (i + 1) * 100;
+ vec_add1 (sacks, block);
+ }
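+  /* sacks[] now holds ten contiguous 100-byte blocks covering [0, 1000) */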
+
+ /*
+ * Inject even blocks
+ */
+
+ for (i = 0; i < 1000 / 200; i++)
+ {
+ vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]);
+ }
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+ tcp_rcv_sacks (tc, 0);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb after even blocks:\n%U", format_tcp_scoreboard,
+ sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 5),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+
+ /* First SACK block should be rejected */
+ hole = scoreboard_first_hole (sb);
+ TCP_TEST ((hole->start == 0 && hole->end == 200),
+ "first hole start %u end %u", hole->start, hole->end);
+ hole = scoreboard_last_hole (sb);
+ TCP_TEST ((hole->start == 900 && hole->end == 1000),
+ "last hole start %u end %u", hole->start, hole->end);
+ TCP_TEST ((sb->sacked_bytes == 400), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
+ TCP_TEST ((sb->last_sacked_bytes == 400),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+ TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked);
+ /*
+ * Inject odd blocks
+ */
+
+ vec_reset_length (tc->rcv_opts.sacks);
+ for (i = 0; i < 1000 / 200; i++)
+ {
+ vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]);
+ }
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+ tcp_rcv_sacks (tc, 0);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb after odd blocks:\n%U", format_tcp_scoreboard,
+ sb);
+
+ hole = scoreboard_first_hole (sb);
+ TCP_TEST ((pool_elts (sb->holes) == 1),
+ "scoreboard has %d holes", pool_elts (sb->holes));
+ TCP_TEST ((hole->start == 0 && hole->end == 100),
+ "first hole start %u end %u", hole->start, hole->end);
+ TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
+ TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked);
+ TCP_TEST ((sb->last_sacked_bytes == 500),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+
+ /*
+ * Ack until byte 100, all bytes are now acked + sacked
+ */
+ tcp_rcv_sacks (tc, 100);
+ if (verbose)
+ vlib_cli_output (vm, "ack until byte 100:\n%U", format_tcp_scoreboard,
+ sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 0),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+ TCP_TEST ((sb->snd_una_adv == 900),
+ "snd_una_adv after ack %u", sb->snd_una_adv);
+ TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked);
+ TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->last_sacked_bytes == 0),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+
+ /*
+ * Add new block
+ */
+
+ vec_reset_length (tc->rcv_opts.sacks);
+
+ block.start = 1200;
+ block.end = 1300;
+ vec_add1 (tc->rcv_opts.sacks, block);
+
+ if (verbose)
+ vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb);
+ tc->snd_una_max = 1500;
+ tc->snd_una = 1000;
+ tc->snd_nxt = 1500;
+ tcp_rcv_sacks (tc, 1000);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb snd_una_max 1500, snd_una 1000:\n%U",
+ format_tcp_scoreboard, sb);
+
+ TCP_TEST ((sb->snd_una_adv == 0),
+ "snd_una_adv after ack %u", sb->snd_una_adv);
+ TCP_TEST ((pool_elts (sb->holes) == 2),
+ "scoreboard has %d holes", pool_elts (sb->holes));
+ hole = scoreboard_first_hole (sb);
+ TCP_TEST ((hole->start == 1000 && hole->end == 1200),
+ "first hole start %u end %u", hole->start, hole->end);
+ TCP_TEST ((sb->snd_una_adv == 0),
+ "snd_una_adv after ack %u", sb->snd_una_adv);
+ TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked);
+ hole = scoreboard_last_hole (sb);
+ TCP_TEST ((hole->start == 1300 && hole->end == 1500),
+ "last hole start %u end %u", hole->start, hole->end);
+ TCP_TEST ((sb->sacked_bytes == 100), "sacked bytes %d", sb->sacked_bytes);
+
+ /*
+ * Ack first hole
+ */
+
+ vec_reset_length (tc->rcv_opts.sacks);
+ tcp_rcv_sacks (tc, 1200);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb ack up to byte 1200:\n%U", format_tcp_scoreboard,
+ sb);
+
+ TCP_TEST ((sb->snd_una_adv == 100),
+ "snd_una_adv after ack %u", sb->snd_una_adv);
+ TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((pool_elts (sb->holes) == 1),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+ hole = scoreboard_first_hole (sb);
+ TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX
+ && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid");
+ TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d",
+ sb->last_bytes_delivered);
+
+ /*
+ * Add some more blocks and then remove all
+ */
+ vec_reset_length (tc->rcv_opts.sacks);
+ tc->snd_una += sb->snd_una_adv;
+ tc->snd_una_max = 1900;
+ for (i = 0; i < 5; i++)
+ {
+ block.start = i * 100 + 1200;
+ block.end = (i + 1) * 100 + 1200;
+ vec_add1 (tc->rcv_opts.sacks, block);
+ }
+ tcp_rcv_sacks (tc, 1900);
+
+ scoreboard_clear (sb);
+ if (verbose)
+ vlib_cli_output (vm, "sb cleared all:\n%U", format_tcp_scoreboard, sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 0),
+ "number of holes %d", pool_elts (sb->holes));
+ TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head);
+ TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail);
+
+ /*
+ * Re-inject odd blocks and ack them all
+ */
+
+ tc->snd_una = 0;
+ tc->snd_una_max = 1000;
+ tc->snd_nxt = 1000;
+ for (i = 0; i < 5; i++)
+ {
+ vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]);
+ }
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+ tcp_rcv_sacks (tc, 0);
+ if (verbose)
+ vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U",
+ format_tcp_scoreboard, sb);
+
+ tcp_rcv_sacks (tc, 950);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U",
+ format_tcp_scoreboard, sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 0),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+ TCP_TEST ((sb->snd_una_adv == 50), "snd_una_adv %u", sb->snd_una_adv);
+ TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->last_sacked_bytes == 0),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+
+ /*
+ * Inject one block, ack it and overlap hole
+ */
+
+ tc->snd_una = 0;
+ tc->snd_una_max = 1000;
+ tc->snd_nxt = 1000;
+
+ block.start = 100;
+ block.end = 500;
+ vec_add1 (tc->rcv_opts.sacks, block);
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+
+ tcp_rcv_sacks (tc, 0);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb added [100, 500]:\n%U",
+ format_tcp_scoreboard, sb);
+
+ tcp_rcv_sacks (tc, 800);
+
+ if (verbose)
+ vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb);
+
+ TCP_TEST ((pool_elts (sb->holes) == 1),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+ TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
+ TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+ TCP_TEST ((sb->last_sacked_bytes == 0),
+ "last sacked bytes %d", sb->last_sacked_bytes);
+ TCP_TEST ((sb->last_bytes_delivered == 400),
+ "last bytes delivered %d", sb->last_bytes_delivered);
+
+ /*
+ * One hole close to head, patch head, split in two and start acking
+ * the lowest part
+ */
+ scoreboard_clear (sb);
+ tc->snd_una = 0;
+ tc->snd_una_max = 1000;
+ tc->snd_nxt = 1000;
+
+ block.start = 500;
+ block.end = 1000;
+ vec_add1 (tc->rcv_opts.sacks, block);
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+
+ tcp_rcv_sacks (tc, 0);
+ if (verbose)
+ vlib_cli_output (vm, "sb added [500, 1000]:\n%U",
+ format_tcp_scoreboard, sb);
+
+ vec_reset_length (tc->rcv_opts.sacks);
+ block.start = 300;
+ block.end = 400;
+ vec_add1 (tc->rcv_opts.sacks, block);
+ tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+ tcp_rcv_sacks (tc, 100);
+ if (verbose)
+ vlib_cli_output (vm, "sb added [0, 100] [300, 400]:\n%U",
+ format_tcp_scoreboard, sb);
+ TCP_TEST ((pool_elts (sb->holes) == 2),
+ "scoreboard has %d elements", pool_elts (sb->holes));
+
+ tc->snd_una = 100;
+ tcp_rcv_sacks (tc, 200);
+ tcp_rcv_sacks (tc, 300);
+ if (verbose)
+ vlib_cli_output (vm, "sb added [0, 300]:\n%U", format_tcp_scoreboard, sb);
+ TCP_TEST ((sb->sacked_bytes == 500), "sacked bytes %d", sb->sacked_bytes);
+
+ return 0;
+}
+
+static int
+tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input)
+{
+ tcp_connection_t _tc, *tc = &_tc;
+ sack_block_t *sacks;
+ int i, verbose = 0, expected;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ {
+ vlib_cli_output (vm, "parse error: '%U'", format_unformat_error,
+ input);
+ return -1;
+ }
+ }
+
+ memset (tc, 0, sizeof (*tc));
+
+ /*
+ * Add odd sack block pairs
+ */
+ for (i = 1; i < 10; i += 2)
+ {
+ tcp_update_sack_list (tc, i * 100, (i + 1) * 100);
+ }
+
+ TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d",
+ vec_len (tc->snd_sacks), 5);
+  TCP_TEST ((tc->snd_sacks[0].start == 900),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+ 900);
+
+ /*
+ * Try to add one extra
+ */
+ sacks = vec_dup (tc->snd_sacks);
+
+ tcp_update_sack_list (tc, 1100, 1200);
+ if (verbose)
+ vlib_cli_output (vm, "add new segment [1100, 1200]\n%U",
+ format_tcp_sacks, tc);
+ expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5;
+ TCP_TEST ((vec_len (tc->snd_sacks) == expected),
+ "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected);
+ TCP_TEST ((tc->snd_sacks[0].start == 1100),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+ 1100);
+
+ /* restore */
+ vec_free (tc->snd_sacks);
+ tc->snd_sacks = sacks;
+
+ /*
+   * Overlap first 2 segments
+ */
+ tc->rcv_nxt = 300;
+ tcp_update_sack_list (tc, 300, 300);
+ if (verbose)
+ vlib_cli_output (vm, "overlap first 2 segments:\n%U",
+ format_tcp_sacks, tc);
+ TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d",
+ vec_len (tc->snd_sacks), 3);
+ TCP_TEST ((tc->snd_sacks[0].start == 900),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+	    900);
+
+ /*
+ * Add a new segment
+ */
+ tcp_update_sack_list (tc, 1100, 1200);
+ if (verbose)
+ vlib_cli_output (vm, "add new segment [1100, 1200]\n%U",
+ format_tcp_sacks, tc);
+ TCP_TEST ((vec_len (tc->snd_sacks) == 4), "sack blocks %d expected %d",
+ vec_len (tc->snd_sacks), 4);
+ TCP_TEST ((tc->snd_sacks[0].start == 1100),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+ 1100);
+
+ /*
+ * Join middle segments
+ */
+ tcp_update_sack_list (tc, 800, 900);
+ if (verbose)
+ vlib_cli_output (vm, "join middle segments [800, 900]\n%U",
+ format_tcp_sacks, tc);
+
+ TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d",
+ vec_len (tc->snd_sacks), 3);
+ TCP_TEST ((tc->snd_sacks[0].start == 700),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+	    700);
+
+ /*
+ * Advance rcv_nxt to overlap all
+ */
+ tc->rcv_nxt = 1200;
+ tcp_update_sack_list (tc, 1200, 1200);
+ if (verbose)
+ vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", format_tcp_sacks, tc);
+ TCP_TEST ((vec_len (tc->snd_sacks) == 0), "sack blocks %d expected %d",
+ vec_len (tc->snd_sacks), 0);
+
+
+ /*
+ * Add 2 blocks, overwrite first and update rcv_nxt to also remove it
+ */
+
+ vec_reset_length (tc->snd_sacks);
+ tc->rcv_nxt = 0;
+
+ tcp_update_sack_list (tc, 100, 200);
+ tcp_update_sack_list (tc, 300, 400);
+
+ if (verbose)
+ vlib_cli_output (vm, "add [100, 200] [300, 400]\n%U",
+ format_tcp_sacks, tc);
+ TCP_TEST ((vec_len (tc->snd_sacks) == 2),
+ "sack blocks %d expected %d", vec_len (tc->snd_sacks), 2);
+ TCP_TEST ((tc->snd_sacks[0].start == 300),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+ 300);
+
+ tc->rcv_nxt = 100;
+ tcp_update_sack_list (tc, 100, 100);
+ if (verbose)
+ vlib_cli_output (vm, "add [100, 200] rcv_nxt = 100\n%U",
+ format_tcp_sacks, tc);
+ TCP_TEST ((vec_len (tc->snd_sacks) == 1),
+ "sack blocks %d expected %d", vec_len (tc->snd_sacks), 1);
+ TCP_TEST ((tc->snd_sacks[0].start == 300),
+ "first sack block start %u expected %u", tc->snd_sacks[0].start,
+ 300);
+ return 0;
+}
+
+static int
+tcp_test_sack (vlib_main_t * vm, unformat_input_t * input)
+{
+ int res = 0;
+
+ /* Run all tests */
+ if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT)
+ {
+ if (tcp_test_sack_tx (vm, input))
+ {
+ return -1;
+ }
+
+ if (tcp_test_sack_rx (vm, input))
+ {
+ return -1;
+ }
+ }
+ else
+ {
+ if (unformat (input, "tx"))
+ {
+ res = tcp_test_sack_tx (vm, input);
+ }
+ else if (unformat (input, "rx"))
+ {
+ res = tcp_test_sack_rx (vm, input);
+ }
+ }
+
+ return res;
+}
+
+
+typedef struct
+{
+ u32 offset;
+ u32 len;
+} test_pattern_t;
+
+/* *INDENT-OFF* */
+test_pattern_t test_pattern[] = {
+ {380, 8}, {768, 8}, {1156, 8}, {1544, 8}, {1932, 8}, {2320, 8}, {2708, 8},
+ {2992, 8}, {372, 8}, {760, 8}, {1148, 8}, {1536, 8}, {1924, 8}, {2312, 8},
+ {2700, 8}, {2984, 8}, {364, 8}, {752, 8}, {1140, 8}, {1528, 8}, {1916, 8},
+ {2304, 8}, {2692, 8}, {2976, 8}, {356, 8}, {744, 8}, {1132, 8}, {1520, 8},
+ {1908, 8}, {2296, 8}, {2684, 8}, {2968, 8}, {348, 8}, {736, 8}, {1124, 8},
+ {1512, 8}, {1900, 8}, {2288, 8}, {2676, 8}, {2960, 8}, {340, 8}, {728, 8},
+ {1116, 8}, {1504, 8}, {1892, 8}, {2280, 8}, {2668, 8}, {2952, 8}, {332, 8},
+ {720, 8}, {1108, 8}, {1496, 8}, {1884, 8}, {2272, 8}, {2660, 8}, {2944, 8},
+ {324, 8}, {712, 8}, {1100, 8}, {1488, 8}, {1876, 8}, {2264, 8}, {2652, 8},
+ {2936, 8}, {316, 8}, {704, 8}, {1092, 8}, {1480, 8}, {1868, 8}, {2256, 8},
+ {2644, 8}, {2928, 8}, {308, 8}, {696, 8}, {1084, 8}, {1472, 8}, {1860, 8},
+ {2248, 8}, {2636, 8}, {2920, 8}, {300, 8}, {688, 8}, {1076, 8}, {1464, 8},
+ {1852, 8}, {2240, 8}, {2628, 8}, {2912, 8}, {292, 8}, {680, 8}, {1068, 8},
+ {1456, 8}, {1844, 8}, {2232, 8}, {2620, 8}, {2904, 8}, {284, 8}, {672, 8},
+ {1060, 8}, {1448, 8}, {1836, 8}, {2224, 8}, {2612, 8}, {2896, 8}, {276, 8},
+ {664, 8}, {1052, 8}, {1440, 8}, {1828, 8}, {2216, 8}, {2604, 8}, {2888, 8},
+ {268, 8}, {656, 8}, {1044, 8}, {1432, 8}, {1820, 8}, {2208, 8}, {2596, 8},
+ {2880, 8}, {260, 8}, {648, 8}, {1036, 8}, {1424, 8}, {1812, 8}, {2200, 8},
+ {2588, 8}, {2872, 8}, {252, 8}, {640, 8}, {1028, 8}, {1416, 8}, {1804, 8},
+ {2192, 8}, {2580, 8}, {2864, 8}, {244, 8}, {632, 8}, {1020, 8}, {1408, 8},
+ {1796, 8}, {2184, 8}, {2572, 8}, {2856, 8}, {236, 8}, {624, 8}, {1012, 8},
+ {1400, 8}, {1788, 8}, {2176, 8}, {2564, 8}, {2848, 8}, {228, 8}, {616, 8},
+ {1004, 8}, {1392, 8}, {1780, 8}, {2168, 8}, {2556, 8}, {2840, 8}, {220, 8},
+ {608, 8}, {996, 8}, {1384, 8}, {1772, 8}, {2160, 8}, {2548, 8}, {2832, 8},
+ {212, 8}, {600, 8}, {988, 8}, {1376, 8}, {1764, 8}, {2152, 8}, {2540, 8},
+ {2824, 8}, {204, 8}, {592, 8}, {980, 8}, {1368, 8}, {1756, 8}, {2144, 8},
+ {2532, 8}, {2816, 8}, {196, 8}, {584, 8}, {972, 8}, {1360, 8}, {1748, 8},
+ {2136, 8}, {2524, 8}, {2808, 8}, {188, 8}, {576, 8}, {964, 8}, {1352, 8},
+ {1740, 8}, {2128, 8}, {2516, 8}, {2800, 8}, {180, 8}, {568, 8}, {956, 8},
+ {1344, 8}, {1732, 8}, {2120, 8}, {2508, 8}, {2792, 8}, {172, 8}, {560, 8},
+ {948, 8}, {1336, 8}, {1724, 8}, {2112, 8}, {2500, 8}, {2784, 8}, {164, 8},
+ {552, 8}, {940, 8}, {1328, 8}, {1716, 8}, {2104, 8}, {2492, 8}, {2776, 8},
+ {156, 8}, {544, 8}, {932, 8}, {1320, 8}, {1708, 8}, {2096, 8}, {2484, 8},
+ {2768, 8}, {148, 8}, {536, 8}, {924, 8}, {1312, 8}, {1700, 8}, {2088, 8},
+ {2476, 8}, {2760, 8}, {140, 8}, {528, 8}, {916, 8}, {1304, 8}, {1692, 8},
+ {2080, 8}, {2468, 8}, {2752, 8}, {132, 8}, {520, 8}, {908, 8}, {1296, 8},
+ {1684, 8}, {2072, 8}, {2460, 8}, {2744, 8}, {124, 8}, {512, 8}, {900, 8},
+ {1288, 8}, {1676, 8}, {2064, 8}, {2452, 8}, {2736, 8}, {116, 8}, {504, 8},
+ {892, 8}, {1280, 8}, {1668, 8}, {2056, 8}, {2444, 8}, {2728, 8}, {108, 8},
+ {496, 8}, {884, 8}, {1272, 8}, {1660, 8}, {2048, 8}, {2436, 8}, {2720, 8},
+ {100, 8}, {488, 8}, {876, 8}, {1264, 8}, {1652, 8}, {2040, 8}, {2428, 8},
+ {2716, 4}, {92, 8}, {480, 8}, {868, 8}, {1256, 8}, {1644, 8}, {2032, 8},
+ {2420, 8}, {84, 8}, {472, 8}, {860, 8}, {1248, 8}, {1636, 8}, {2024, 8},
+ {2412, 8}, {76, 8}, {464, 8}, {852, 8}, {1240, 8}, {1628, 8}, {2016, 8},
+ {2404, 8}, {68, 8}, {456, 8}, {844, 8}, {1232, 8}, {1620, 8}, {2008, 8},
+ {2396, 8}, {60, 8}, {448, 8}, {836, 8}, {1224, 8}, {1612, 8}, {2000, 8},
+ {2388, 8}, {52, 8}, {440, 8}, {828, 8}, {1216, 8}, {1604, 8}, {1992, 8},
+ {2380, 8}, {44, 8}, {432, 8}, {820, 8}, {1208, 8}, {1596, 8}, {1984, 8},
+ {2372, 8}, {36, 8}, {424, 8}, {812, 8}, {1200, 8}, {1588, 8}, {1976, 8},
+ {2364, 8}, {28, 8}, {416, 8}, {804, 8}, {1192, 8}, {1580, 8}, {1968, 8},
+ {2356, 8}, {20, 8}, {408, 8}, {796, 8}, {1184, 8}, {1572, 8}, {1960, 8},
+ {2348, 8}, {12, 8}, {400, 8}, {788, 8}, {1176, 8}, {1564, 8}, {1952, 8},
+ {2340, 8}, {4, 8}, {392, 8}, {780, 8}, {1168, 8}, {1556, 8}, {1944, 8},
+ {2332, 8},
+ /* missing from original data set */
+ {388, 4}, {776, 4}, {1164, 4}, {1552, 4}, {1940, 4}, {2328, 4},
+};
+/* *INDENT-ON* */
+
+int
+pattern_cmp (const void *arg1, const void *arg2)
+{
+ test_pattern_t *a1 = (test_pattern_t *) arg1;
+ test_pattern_t *a2 = (test_pattern_t *) arg2;
+
+ if (a1->offset < a2->offset)
+ return -1;
+ else if (a1->offset > a2->offset)
+ return 1;
+ return 0;
+}
+
+static u8
+fifo_validate_pattern (vlib_main_t * vm, test_pattern_t * pattern,
+ u32 pattern_length)
+{
+ test_pattern_t *tp = pattern;
+ int i;
+
+ /* Go through the pattern and make 100% sure it's sane */
+ for (i = 0; i < pattern_length - 1; i++)
+ {
+ if (tp->offset + tp->len != (tp + 1)->offset)
+ {
+ vlib_cli_output (vm, "[%d] missing {%d, %d}", i,
+ (tp->offset + tp->len),
+ (tp + 1)->offset - (tp->offset + tp->len));
+ return 0;
+ }
+ tp++;
+ }
+ return 1;
+}
+
+static test_pattern_t *
+fifo_get_validate_pattern (vlib_main_t * vm, test_pattern_t * test_data,
+ u32 test_data_len)
+{
+ test_pattern_t *validate_pattern = 0;
+
+ /* Validate, and try segments in order... */
+ vec_validate (validate_pattern, test_data_len - 1);
+ memcpy (validate_pattern, test_data,
+ test_data_len * sizeof (test_pattern_t));
+ qsort ((u8 *) validate_pattern, test_data_len, sizeof (test_pattern_t),
+ pattern_cmp);
+
+ if (fifo_validate_pattern (vm, validate_pattern, test_data_len) == 0)
+ return 0;
+
+ return validate_pattern;
+}
+
+static svm_fifo_t *
+fifo_prepare (u32 fifo_size)
+{
+ svm_fifo_t *f;
+ f = svm_fifo_create (fifo_size);
+
+ /* Paint fifo data vector with -1's */
+ memset (f->data, 0xFF, fifo_size);
+
+ return f;
+}
+
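+/*
+ * Compare data1 and data2 over [start, len); on the first mismatch
+ * store the offending index in *index and return 1, otherwise return 0.
+ */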
+static int
+compare_data (u8 * data1, u8 * data2, u32 start, u32 len, u32 * index)
+{
+ int i;
+
+ for (i = start; i < len; i++)
+ {
+ if (data1[i] != data2[i])
+ {
+ *index = i;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int
+tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
+{
+ svm_fifo_t *f;
+ u32 fifo_size = 1 << 20;
+ u32 *test_data = 0;
+ u32 offset;
+ int i, rv, verbose = 0;
+ u32 data_word, test_data_len, j;
+ ooo_segment_t *ooo_seg;
+ u8 *data, *s, *data_buf = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+      else
+	break;
+    }
+
+ test_data_len = fifo_size / sizeof (u32);
+ vec_validate (test_data, test_data_len - 1);
+
+ for (i = 0; i < vec_len (test_data); i++)
+ test_data[i] = i;
+
+ f = fifo_prepare (fifo_size);
+
+ /*
+ * Enqueue an initial (un-dequeued) chunk
+ */
+ rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) test_data);
+ TCP_TEST ((rv == sizeof (u32)), "enqueued %d", rv);
+ TCP_TEST ((f->tail == 4), "fifo tail %u", f->tail);
+
+ /*
+ * Create 3 chunks in the future. The offsets are relative
+ * to the current fifo tail
+ */
+ for (i = 0; i < 3; i++)
+ {
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
+ data = (u8 *) (test_data + (2 * i + 1));
+ if (i == 0)
+ {
+ rv = svm_fifo_enqueue_nowait (f, sizeof (u32), data);
+ rv = rv > 0 ? 0 : rv;
+ }
+ else
+ rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
+ if (verbose)
+ vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset,
+ offset + sizeof (u32));
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ goto err;
+ }
+ }
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo after odd segs: %U", format_svm_fifo, f, 1);
+
+ TCP_TEST ((f->tail == 8), "fifo tail %u", f->tail);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ /*
+ * Try adding a completely overlapped segment
+ */
+ offset = 3 * sizeof (u32) - f->tail;
+ data = (u8 *) (test_data + 3);
+ rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ goto err;
+ }
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo after overlap seg: %U", format_svm_fifo, f, 1);
+
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ /*
+ * Make sure format functions are not buggy
+ */
+ s = format (0, "%U", format_svm_fifo, f, 2);
+ vec_free (s);
+
+ /*
+ * Paint some of missing data backwards
+   * Paint some of the missing data backwards
+ for (i = 3; i > 1; i--)
+ {
+ offset = (2 * i + 0) * sizeof (u32) - f->tail;
+ data = (u8 *) (test_data + (2 * i + 0));
+ rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
+ if (verbose)
+ vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i, offset,
+ offset + sizeof (u32));
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ goto err;
+ }
+ }
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo before missing link: %U", format_svm_fifo, f,
+ 1);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == 12),
+ "first ooo seg position %u", ooo_seg->start);
+ TCP_TEST ((ooo_seg->length == 16),
+ "first ooo seg length %u", ooo_seg->length);
+
+ /*
+ * Enqueue the missing u32
+ */
+ rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) (test_data + 2));
+ if (verbose)
+ vlib_cli_output (vm, "fifo after missing link: %U", format_svm_fifo, f,
+ 1);
+ TCP_TEST ((rv == 20), "bytes to be enqueued %u", rv);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ /*
+ * Collect results
+ */
+ for (i = 0; i < 7; i++)
+ {
+ rv = svm_fifo_dequeue_nowait (f, sizeof (u32), (u8 *) & data_word);
+ if (rv != sizeof (u32))
+ {
+	  clib_warning ("bytes dequeued %u", rv);
+ goto err;
+ }
+ if (data_word != test_data[i])
+ {
+ clib_warning ("recovered [%d] %d not %d", i, data_word,
+ test_data[i]);
+ goto err;
+ }
+ }
+
+ /*
+ * Test segment overlaps: last ooo segment overlaps all
+ */
+ svm_fifo_free (f);
+ f = fifo_prepare (fifo_size);
+
+ for (i = 0; i < 4; i++)
+ {
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
+ data = (u8 *) (test_data + (2 * i + 1));
+ rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
+ if (verbose)
+ vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset,
+ offset + sizeof (u32));
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ goto err;
+ }
+ }
+
+ rv = svm_fifo_enqueue_with_offset (f, 8 - f->tail, 21, data);
+ TCP_TEST ((rv == 0), "ooo enqueued %u", rv);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ vec_validate (data_buf, vec_len (data));
+ svm_fifo_peek (f, 0, vec_len (data), data_buf);
+ if (compare_data (data_buf, data, 8, vec_len (data), &j))
+ {
+ TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]);
+ }
+ vec_reset_length (data_buf);
+
+ /*
+ * Test segment overlaps: enqueue and overlap ooo segments
+ */
+ svm_fifo_free (f);
+ f = fifo_prepare (fifo_size);
+
+ for (i = 0; i < 4; i++)
+ {
+ offset = (2 * i + 1) * sizeof (u32) - f->tail;
+ data = (u8 *) (test_data + (2 * i + 1));
+ rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data);
+ if (verbose)
+ vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset,
+ offset + sizeof (u32));
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ goto err;
+ }
+ }
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo after enqueue: %U", format_svm_fifo, f, 1);
+
+ rv = svm_fifo_enqueue_nowait (f, 29, data);
+ if (verbose)
+ vlib_cli_output (vm, "fifo after enqueueing 29: %U", format_svm_fifo, f,
+ 1);
+ TCP_TEST ((rv == 32), "ooo enqueued %u", rv);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ vec_validate (data_buf, vec_len (data));
+ svm_fifo_peek (f, 0, vec_len (data), data_buf);
+ if (compare_data (data_buf, data, 0, vec_len (data), &j))
+ {
+ TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]);
+ }
+
+ /* Try to peek beyond the data */
+ rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf);
+ TCP_TEST ((rv == 0), "peeked %u expected 0", rv);
+
+ vec_free (data_buf);
+ svm_fifo_free (f);
+ vec_free (test_data);
+
+ return 0;
+
+err:
+ svm_fifo_free (f);
+ vec_free (test_data);
+ return -1;
+}
+
+static int
+tcp_test_fifo2 (vlib_main_t * vm)
+{
+ svm_fifo_t *f;
+ u32 fifo_size = 1 << 20;
+ int i, rv, test_data_len;
+ u64 data64;
+ test_pattern_t *tp, *vp, *test_data;
+ ooo_segment_t *ooo_seg;
+
+ test_data = test_pattern;
+ test_data_len = ARRAY_LEN (test_pattern);
+
+ vp = fifo_get_validate_pattern (vm, test_data, test_data_len);
+
+ /* Create a fifo */
+ f = fifo_prepare (fifo_size);
+
+ /*
+ * Try with sorted data
+ */
+ for (i = 0; i < test_data_len; i++)
+ {
+ tp = vp + i;
+ data64 = tp->offset;
+ svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len,
+ (u8 *) & data64);
+ }
+
+ /* Expected result: one big fat chunk at offset 4 */
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == 4),
+ "first ooo seg position %u", ooo_seg->start);
+ TCP_TEST ((ooo_seg->length == 2996),
+ "first ooo seg length %u", ooo_seg->length);
+
+ data64 = 0;
+ rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64);
+ TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv);
+
+ svm_fifo_free (f);
+ vec_free (vp);
+
+ /*
+ * Now try it again w/ unsorted data...
+ */
+
+ f = fifo_prepare (fifo_size);
+
+ for (i = 0; i < test_data_len; i++)
+ {
+ tp = &test_data[i];
+ data64 = tp->offset;
+ rv = svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len,
+ (u8 *) & data64);
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ }
+ }
+
+ /* Expecting the same result: one big fat chunk at offset 4 */
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == 4),
+ "first ooo seg position %u", ooo_seg->start);
+ TCP_TEST ((ooo_seg->length == 2996),
+ "first ooo seg length %u", ooo_seg->length);
+
+ data64 = 0;
+ rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64);
+
+ TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv);
+
+ svm_fifo_free (f);
+
+ return 0;
+}
+
+static int
+tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input)
+{
+ svm_fifo_t *f;
+ u32 fifo_size = 4 << 10;
+ u32 fifo_initial_offset = 0;
+ u32 total_size = 2 << 10;
+ int overlap = 0, verbose = 0, randomize = 1, drop = 0, in_seq_all = 0;
+ u8 *data_pattern = 0, *data_buf = 0;
+ test_pattern_t *tp, *generate = 0;
+ u32 nsegs = 2, seg_size, length_so_far;
+ u32 current_offset, offset_increment, len_this_chunk;
+ u32 seed = 0xdeaddabe, j;
+ int i, rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "fifo-size %d", &fifo_size))
+ ;
+ else if (unformat (input, "total-size %d", &total_size))
+ ;
+ else if (unformat (input, "verbose"))
+ verbose = 1;
+ else if (unformat (input, "overlap"))
+ overlap = 1;
+ else if (unformat (input, "initial-offset %d", &fifo_initial_offset))
+ ;
+ else if (unformat (input, "seed %d", &seed))
+ ;
+ else if (unformat (input, "nsegs %d", &nsegs))
+ ;
+ else if (unformat (input, "no-randomize"))
+ randomize = 0;
+ else if (unformat (input, "in-seq-all"))
+ in_seq_all = 1;
+ else if (unformat (input, "drop"))
+ drop = 1;
+ else
+ {
+ clib_error_t *e = clib_error_return
+ (0, "unknown input `%U'", format_unformat_error, input);
+ clib_error_report (e);
+ return -1;
+ }
+ }
+
+ if (total_size > fifo_size)
+ {
+ clib_warning ("total_size %d greater than fifo size %d", total_size,
+ fifo_size);
+ return -1;
+ }
+ if (overlap && randomize == 0)
+ {
+ clib_warning ("Can't enqueue in-order with overlap");
+ return -1;
+ }
+
+ /*
+ * Generate data
+ */
+ vec_validate (data_pattern, total_size - 1);
+ for (i = 0; i < vec_len (data_pattern); i++)
+ data_pattern[i] = i & 0xff;
+
+ /*
+ * Generate segments
+ */
+ seg_size = total_size / nsegs;
+ length_so_far = 0;
+ current_offset = randomize;
+ while (length_so_far < total_size)
+ {
+ vec_add2 (generate, tp, 1);
+ len_this_chunk = clib_min (seg_size, total_size - length_so_far);
+ tp->offset = current_offset;
+ tp->len = len_this_chunk;
+
+ if (overlap && (len_this_chunk == seg_size))
+ do
+ {
+ offset_increment = len_this_chunk
+ % (1 + (random_u32 (&seed) % len_this_chunk));
+ }
+ while (offset_increment == 0);
+ else
+ offset_increment = len_this_chunk;
+
+ current_offset += offset_increment;
+ length_so_far = tp->offset + tp->len;
+ }
+
+ /*
+ * Validate segment list. Only valid for non-overlap cases.
+ */
+ if (overlap == 0)
+ fifo_validate_pattern (vm, generate, vec_len (generate));
+
+ if (verbose)
+ {
+ vlib_cli_output (vm, "raw data pattern:");
+ for (i = 0; i < vec_len (generate); i++)
+ {
+ vlib_cli_output (vm, "[%d] offset %u len %u", i,
+ generate[i].offset, generate[i].len);
+ }
+ }
+
+ /* Randomize data pattern */
+ if (randomize)
+ {
+ for (i = 0; i < vec_len (generate) / 2; i++)
+ {
+ u32 src_index, dst_index;
+ test_pattern_t _tmp, *tmp = &_tmp;
+
+ src_index = random_u32 (&seed) % vec_len (generate);
+ dst_index = random_u32 (&seed) % vec_len (generate);
+
+ tmp[0] = generate[dst_index];
+ generate[dst_index] = generate[src_index];
+ generate[src_index] = tmp[0];
+ }
+ if (verbose)
+ {
+ vlib_cli_output (vm, "randomized data pattern:");
+ for (i = 0; i < vec_len (generate); i++)
+ {
+ vlib_cli_output (vm, "[%d] offset %u len %u", i,
+ generate[i].offset, generate[i].len);
+ }
+ }
+ }
+
+ /*
+ * Create a fifo and add segments
+ */
+ f = fifo_prepare (fifo_size);
+
+ /* manually set head and tail pointers to validate modular arithmetic */
+ fifo_initial_offset = fifo_initial_offset % fifo_size;
+ f->head = fifo_initial_offset;
+ f->tail = fifo_initial_offset;
+
+ for (i = !randomize; i < vec_len (generate); i++)
+ {
+ tp = generate + i;
+ svm_fifo_enqueue_with_offset (f,
+ fifo_initial_offset + tp->offset -
+ f->tail, tp->len,
+ (u8 *) data_pattern + tp->offset);
+ }
+
+  /* Add the first segment in order for non-randomized data */
+ if (!randomize)
+ svm_fifo_enqueue_nowait (f, generate[0].len, (u8 *) data_pattern);
+
+ /*
+ * Expected result: one big fat chunk at offset 1 if randomize == 1
+ */
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo before missing link: %U",
+ format_svm_fifo, f, 1 /* verbose */ );
+
+ /*
+ * Add the missing byte if segments were randomized
+ */
+ if (randomize)
+ {
+ u32 bytes_to_enq = 1;
+ if (in_seq_all)
+ bytes_to_enq = total_size;
+ rv = svm_fifo_enqueue_nowait (f, bytes_to_enq, data_pattern + 0);
+
+ if (verbose)
+ vlib_cli_output (vm, "in-order enqueue returned %d", rv);
+
+ TCP_TEST ((rv == total_size), "enqueued %u expected %u", rv,
+ total_size);
+
+ }
+
+ TCP_TEST ((svm_fifo_has_ooo_data (f) == 0), "number of ooo segments %u",
+ svm_fifo_number_ooo_segments (f));
+
+ /*
+ * Test if peeked data is the same as original data
+ */
+ vec_validate (data_buf, vec_len (data_pattern));
+ svm_fifo_peek (f, 0, vec_len (data_pattern), data_buf);
+ if (compare_data (data_buf, data_pattern, 0, vec_len (data_pattern), &j))
+ {
+ TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j],
+ data_pattern[j]);
+ }
+ vec_reset_length (data_buf);
+
+ /*
+ * Dequeue or drop all data
+ */
+ if (drop)
+ {
+ svm_fifo_dequeue_drop (f, vec_len (data_pattern));
+ }
+ else
+ {
+ svm_fifo_dequeue_nowait (f, vec_len (data_pattern), data_buf);
+ if (compare_data
+ (data_buf, data_pattern, 0, vec_len (data_pattern), &j))
+ {
+ TCP_TEST (0, "[%d] dequeued %u expected %u", j, data_buf[j],
+ data_pattern[j]);
+ }
+ }
+
+ TCP_TEST ((svm_fifo_max_dequeue (f) == 0), "fifo has %d bytes",
+ svm_fifo_max_dequeue (f));
+
+ svm_fifo_free (f);
+ vec_free (data_pattern);
+ vec_free (data_buf);
+
+ return 0;
+}
+
+static int
+tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input)
+{
+ svm_fifo_t *f;
+ u32 fifo_size = 6 << 10;
+ u32 fifo_initial_offset = 1000000000;
+ u32 test_n_bytes = 5000, j;
+ u8 *test_data = 0, *data_buf = 0;
+ int i, rv, verbose = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ {
+ clib_error_t *e = clib_error_return
+ (0, "unknown input `%U'", format_unformat_error, input);
+ clib_error_report (e);
+ return -1;
+ }
+ }
+
+ /*
+ * Create a fifo and add segments
+ */
+ f = fifo_prepare (fifo_size);
+
+ /* Set head and tail pointers */
+ fifo_initial_offset = fifo_initial_offset % fifo_size;
+ svm_fifo_init_pointers (f, fifo_initial_offset);
+
+ vec_validate (test_data, test_n_bytes - 1);
+ for (i = 0; i < vec_len (test_data); i++)
+ test_data[i] = i;
+
+ for (i = test_n_bytes - 1; i > 0; i--)
+ {
+ rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i - f->tail,
+ sizeof (u8), &test_data[i]);
+ if (verbose)
+ vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8));
+ if (rv)
+ {
+ clib_warning ("enqueue returned %d", rv);
+ svm_fifo_free (f);
+ vec_free (test_data);
+ return -1;
+ }
+ }
+
+ svm_fifo_enqueue_nowait (f, sizeof (u8), &test_data[0]);
+
+ vec_validate (data_buf, vec_len (test_data));
+
+ svm_fifo_dequeue_nowait (f, vec_len (test_data), data_buf);
+ rv = compare_data (data_buf, test_data, 0, vec_len (test_data), &j);
+ if (rv)
+ vlib_cli_output (vm, "[%d] dequeued %u expected %u", j, data_buf[j],
+ test_data[j]);
+ TCP_TEST ((rv == 0), "dequeued compared to original returned %d", rv);
+
+ svm_fifo_free (f);
+ vec_free (test_data);
+ return 0;
+}
+
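+/* Wrap an absolute fifo position into the valid index range [0, nitems) */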
+static u32
+fifo_pos (svm_fifo_t * f, u32 pos)
+{
+ return pos % f->nitems;
+}
+
+static int
+tcp_test_fifo5 (vlib_main_t * vm, unformat_input_t * input)
+{
+ svm_fifo_t *f;
+ u32 fifo_size = 400, j = 0, offset = 200;
+ int i, rv, verbose = 0;
+ u8 *test_data = 0, *data_buf = 0;
+ ooo_segment_t *ooo_seg;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ {
+ clib_error_t *e = clib_error_return
+ (0, "unknown input `%U'", format_unformat_error, input);
+ clib_error_report (e);
+ return -1;
+ }
+ }
+
+ f = fifo_prepare (fifo_size);
+ svm_fifo_init_pointers (f, offset);
+
+ vec_validate (test_data, 399);
+ for (i = 0; i < vec_len (test_data); i++)
+ test_data[i] = i % 0xff;
+
+ /*
+ * Start with [100, 200] and [300, 400]
+ */
+ svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]);
+ svm_fifo_enqueue_with_offset (f, 300, 100, &test_data[300]);
+
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ TCP_TEST ((f->ooos_newest == 1), "newest %u", f->ooos_newest);
+ if (verbose)
+ vlib_cli_output (vm, "fifo after [100, 200] and [300, 400] : %U",
+ format_svm_fifo, f, 2 /* verbose */ );
+
+ /*
+ * Add [225, 275]
+ */
+
+ rv = svm_fifo_enqueue_with_offset (f, 225, 50, &test_data[200]);
+ if (verbose)
+ vlib_cli_output (vm, "fifo after [225, 275] : %U",
+ format_svm_fifo, f, 2 /* verbose */ );
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 3),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == fifo_pos (f, 100 + offset)),
+ "first seg start %u expected %u", ooo_seg->start,
+ fifo_pos (f, 100 + offset));
+ TCP_TEST ((ooo_seg->length == 100), "first seg length %u expected %u",
+ ooo_seg->length, 100);
+ ooo_seg = ooo_segment_next (f, ooo_seg);
+ TCP_TEST ((ooo_seg->start == fifo_pos (f, 225 + offset)),
+ "second seg start %u expected %u",
+ ooo_seg->start, fifo_pos (f, 225 + offset));
+ TCP_TEST ((ooo_seg->length == 50), "second seg length %u expected %u",
+ ooo_seg->length, 50);
+ ooo_seg = ooo_segment_next (f, ooo_seg);
+ TCP_TEST ((ooo_seg->start == fifo_pos (f, 300 + offset)),
+ "third seg start %u expected %u",
+ ooo_seg->start, fifo_pos (f, 300 + offset));
+ TCP_TEST ((ooo_seg->length == 100), "third seg length %u expected %u",
+ ooo_seg->length, 100);
+ TCP_TEST ((f->ooos_newest == 2), "newest %u", f->ooos_newest);
+ /*
+ * Add [190, 310]
+ */
+ rv = svm_fifo_enqueue_with_offset (f, 190, 120, &test_data[190]);
+ if (verbose)
+ vlib_cli_output (vm, "fifo after [190, 310] : %U",
+ format_svm_fifo, f, 1 /* verbose */ );
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == fifo_pos (f, offset + 100)),
+ "first seg start %u expected %u",
+ ooo_seg->start, fifo_pos (f, offset + 100));
+ TCP_TEST ((ooo_seg->length == 300), "first seg length %u expected %u",
+ ooo_seg->length, 300);
+
+ /*
+ * Add [0, 150]
+ */
+ rv = svm_fifo_enqueue_nowait (f, 150, test_data);
+
+ if (verbose)
+ vlib_cli_output (vm, "fifo after [0 150] : %U", format_svm_fifo, f,
+ 2 /* verbose */ );
+
+ TCP_TEST ((rv == 400), "managed to enqueue %u expected %u", rv, 400);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+
+ vec_validate (data_buf, 399);
+ svm_fifo_peek (f, 0, 400, data_buf);
+ if (compare_data (data_buf, test_data, 0, 400, &j))
+ {
+ TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j],
+ test_data[j]);
+ }
+
+ /*
+ * Add [100 200] and overlap it with [50 250]
+ */
+ svm_fifo_free (f);
+ f = fifo_prepare (fifo_size);
+
+ svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]);
+ svm_fifo_enqueue_with_offset (f, 50, 200, &test_data[50]);
+ TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1),
+ "number of ooo segments %u", svm_fifo_number_ooo_segments (f));
+ ooo_seg = svm_fifo_first_ooo_segment (f);
+ TCP_TEST ((ooo_seg->start == 50), "first seg start %u expected %u",
+ ooo_seg->start, 50);
+ TCP_TEST ((ooo_seg->length == 200), "first seg length %u expected %u",
+ ooo_seg->length, 200);
+
+ svm_fifo_free (f);
+ vec_free (test_data);
+ return 0;
+}
+
+/* *INDENT-OFF* */
+svm_fifo_trace_elem_t fifo_trace[] = {};
+/* *INDENT-ON* */
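+
+/*
+ * Empty placeholder: presumably populated with a captured fifo trace
+ * (under SVMF_FIFO_TRACE) before running "replay".
+ */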
+
+static int
+tcp_test_fifo_replay (vlib_main_t * vm, unformat_input_t * input)
+{
+ svm_fifo_t f;
+ int verbose = 0;
+ u8 no_read = 0, *str = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else if (unformat (input, "no-read"))
+ no_read = 1;
+ else
+ {
+ clib_error_t *e = clib_error_return
+ (0, "unknown input `%U'", format_unformat_error, input);
+ clib_error_report (e);
+ return -1;
+ }
+ }
+
+#if SVMF_FIFO_TRACE
+ f.trace = fifo_trace;
+#endif
+
+ str = svm_fifo_replay (str, &f, no_read, verbose);
+ vlib_cli_output (vm, "%v", str);
+ return 0;
+}
+
+static int
+tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input)
+{
+ int res = 0;
+ char *str;
+
+ /* Run all tests */
+ if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT)
+ {
+ res = tcp_test_fifo1 (vm, input);
+ if (res)
+ return res;
+
+ res = tcp_test_fifo2 (vm);
+ if (res)
+ return res;
+
+ /*
+ * Run a number of fifo3 configs
+ */
+ str = "nsegs 10 overlap seed 123";
+ unformat_init_cstring (input, str);
+ if (tcp_test_fifo3 (vm, input))
+ return -1;
+ unformat_free (input);
+
+ str = "nsegs 10 overlap seed 123 in-seq-all";
+ unformat_init_cstring (input, str);
+ if (tcp_test_fifo3 (vm, input))
+ return -1;
+ unformat_free (input);
+
+ str = "nsegs 10 overlap seed 123 initial-offset 3917";
+ unformat_init_cstring (input, str);
+ if (tcp_test_fifo3 (vm, input))
+ return -1;
+ unformat_free (input);
+
+ str = "nsegs 10 overlap seed 123 initial-offset 3917 drop";
+ unformat_init_cstring (input, str);
+ if (tcp_test_fifo3 (vm, input))
+ return -1;
+ unformat_free (input);
+
+ str = "nsegs 10 seed 123 initial-offset 3917 drop no-randomize";
+ unformat_init_cstring (input, str);
+ if (tcp_test_fifo3 (vm, input))
+ return -1;
+ unformat_free (input);
+
+ res = tcp_test_fifo4 (vm, input);
+ if (res)
+ return res;
+
+ res = tcp_test_fifo5 (vm, input);
+ if (res)
+ return res;
+ }
+ else
+ {
+ if (unformat (input, "fifo3"))
+ {
+ res = tcp_test_fifo3 (vm, input);
+ }
+ else if (unformat (input, "fifo2"))
+ {
+ res = tcp_test_fifo2 (vm);
+ }
+ else if (unformat (input, "fifo1"))
+ {
+ res = tcp_test_fifo1 (vm, input);
+ }
+ else if (unformat (input, "fifo4"))
+ {
+ res = tcp_test_fifo4 (vm, input);
+ }
+ else if (unformat (input, "fifo5"))
+ {
+ res = tcp_test_fifo5 (vm, input);
+ }
+ else if (unformat (input, "replay"))
+ {
+ res = tcp_test_fifo_replay (vm, input);
+ }
+ }
+
+ return res;
+}
+
+static int
+tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input)
+{
+ session_manager_main_t *smm = &session_manager_main;
+ tcp_main_t *tm = &tcp_main;
+ transport_connection_t _tc1, *tc1 = &_tc1, _tc2, *tc2 = &_tc2, *tconn;
+ tcp_connection_t *tc;
+ stream_session_t *s;
+ u8 cmp = 0;
+
+ pool_get (smm->sessions[0], s);
+ memset (s, 0, sizeof (*s));
+ s->session_index = s - smm->sessions[0];
+
+ pool_get (tm->connections[0], tc);
+ memset (tc, 0, sizeof (*tc));
+ tc->connection.c_index = tc - tm->connections[0];
+ tc->connection.s_index = s->session_index;
+ s->connection_index = tc->connection.c_index;
+
+ tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101);
+ tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000103);
+ tc->connection.lcl_port = 35051;
+ tc->connection.rmt_port = 53764;
+ tc->connection.transport_proto = 0;
+ clib_memcpy (tc1, &tc->connection, sizeof (*tc1));
+
+ pool_get (session_manager_main.sessions[0], s);
+ memset (s, 0, sizeof (*s));
+ s->session_index = s - smm->sessions[0];
+ pool_get (tm->connections[0], tc);
+ memset (tc, 0, sizeof (*tc));
+ tc->connection.c_index = tc - tm->connections[0];
+ tc->connection.s_index = s->session_index;
+ s->connection_index = tc->connection.c_index;
+
+ tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101);
+ tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000102);
+ tc->connection.lcl_port = 38225;
+ tc->connection.rmt_port = 53764;
+ tc->connection.transport_proto = 0;
+ clib_memcpy (tc2, &tc->connection, sizeof (*tc2));
+
+ /*
+ * Confirm that connection lookup works
+ */
+
+ stream_session_table_add_for_tc (tc1, tc1->s_index);
+ tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4,
+ &tc1->rmt_ip.ip4,
+ tc1->lcl_port, tc1->rmt_port,
+ tc1->transport_proto, 0);
+ cmp = (memcmp (&tconn->rmt_ip, &tc1->rmt_ip, sizeof (tc1->rmt_ip)) == 0);
+ TCP_TEST ((cmp), "rmt ip is identical %d", cmp);
+ TCP_TEST ((tconn->lcl_port == tc1->lcl_port),
+	    "lcl port is identical %d", tconn->lcl_port == tc1->lcl_port);
+
+ /*
+ * Non-existing connection lookup should not work
+ */
+
+ tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+ &tc2->rmt_ip.ip4,
+ tc2->lcl_port, tc2->rmt_port,
+ tc2->transport_proto, 0);
+ TCP_TEST ((tconn == 0), "lookup result should be null");
+
+ /*
+ * Delete and lookup again
+ */
+ stream_session_table_del_for_tc (tc1);
+ tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4,
+ &tc1->rmt_ip.ip4,
+ tc1->lcl_port, tc1->rmt_port,
+ tc1->transport_proto, 0);
+ TCP_TEST ((tconn == 0), "lookup result should be null");
+ tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+ &tc2->rmt_ip.ip4,
+ tc2->lcl_port, tc2->rmt_port,
+ tc2->transport_proto, 0);
+ TCP_TEST ((tconn == 0), "lookup result should be null");
+
+ /*
+   * Re-add tc1 and verify that tc2 still cannot be looked up
+ */
+ stream_session_table_add_for_tc (tc1, tc1->s_index);
+ tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4,
+ &tc2->rmt_ip.ip4,
+ tc2->lcl_port, tc2->rmt_port,
+ tc2->transport_proto, 0);
+ TCP_TEST ((tconn == 0), "lookup result should be null");
+
+ return 0;
+}
+
+static int
+tcp_test_session (vlib_main_t * vm, unformat_input_t * input)
+{
+ int rv = 0;
+ tcp_connection_t *tc0;
+ u8 sst = SESSION_TYPE_IP4_TCP;
+ ip4_address_t local, remote;
+ u16 local_port, remote_port;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ int is_add = 1;
+
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "del"))
+ is_add = 0;
+ else if (unformat (input, "add"))
+ is_add = 1;
+ else
+ break;
+ }
+
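+  /*
+   * "add": fabricate an established IPv4 TCP connection and push it
+   * through the session accept path; "del": mark the connection closed
+   * and fire the disconnect notification.
+   */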
+ if (is_add)
+ {
+ local.as_u32 = clib_host_to_net_u32 (0x06000101);
+ remote.as_u32 = clib_host_to_net_u32 (0x06000102);
+ local_port = clib_host_to_net_u16 (1234);
+ remote_port = clib_host_to_net_u16 (11234);
+
+ pool_get (tm->connections[0], tc0);
+ memset (tc0, 0, sizeof (*tc0));
+
+ tc0->state = TCP_STATE_ESTABLISHED;
+ tc0->rcv_las = 1;
+ tc0->c_c_index = tc0 - tm->connections[0];
+ tc0->c_lcl_port = local_port;
+ tc0->c_rmt_port = remote_port;
+ tc0->c_is_ip4 = 1;
+ tc0->c_thread_index = 0;
+ tc0->c_lcl_ip4.as_u32 = local.as_u32;
+ tc0->c_rmt_ip4.as_u32 = remote.as_u32;
+ tc0->rcv_opts.mss = 1450;
+ tcp_connection_init_vars (tc0);
+
+ TCP_EVT_DBG (TCP_EVT_OPEN, tc0);
+
+ if (stream_session_accept (&tc0->connection, 0 /* listener index */ ,
+ sst, 0 /* notify */ ))
+ clib_warning ("stream_session_accept failed");
+
+ stream_session_accept_notify (&tc0->connection);
+ }
+ else
+ {
+ tc0 = tcp_connection_get (0 /* connection index */ , 0 /* thread */ );
+ tc0->state = TCP_STATE_CLOSED;
+ stream_session_disconnect_notify (&tc0->connection);
+ }
+
+ return rv;
+}
+
+static clib_error_t *
+tcp_test (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd_arg)
+{
+ int res = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "sack"))
+ {
+ res = tcp_test_sack (vm, input);
+ }
+ else if (unformat (input, "fifo"))
+ {
+ res = tcp_test_fifo (vm, input);
+ }
+ else if (unformat (input, "session"))
+ {
+ res = tcp_test_session (vm, input);
+ }
+ else if (unformat (input, "lookup"))
+ {
+ res = tcp_test_lookup (vm, input);
+ }
+ else
+ break;
+ }
+
+ if (res)
+ {
+ return clib_error_return (0, "TCP unit test failed");
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tcp_test_command, static) =
+{
+ .path = "test tcp",
+ .short_help = "internal tcp unit tests",
+ .function = tcp_test,
+};
+/* *INDENT-ON* */
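+
+/*
+ * Illustrative invocations of the unit-test command:
+ *   vpp# test tcp sack tx verbose
+ *   vpp# test tcp fifo fifo3 nsegs 10 overlap seed 123
+ *   vpp# test tcp lookup
+ */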
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_timer.h b/src/vnet/tcp/tcp_timer.h
new file mode 100644
index 00000000..fa25268c
--- /dev/null
+++ b/src/vnet/tcp/tcp_timer.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_tcp_timer_h__
+#define __included_tcp_timer_h__
+
+#include <vppinfra/tw_timer_16t_2w_512sl.h>
+#include <vppinfra/tw_timer_16t_1w_2048sl.h>
+
+#endif /* __included_tcp_timer_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */