Diffstat (limited to 'src/vnet/tcp')
-rw-r--r--  src/vnet/tcp/builtin_client.c        770
-rw-r--r--  src/vnet/tcp/builtin_client.h        121
-rw-r--r--  src/vnet/tcp/builtin_http_server.c   564
-rw-r--r--  src/vnet/tcp/builtin_proxy.c         601
-rw-r--r--  src/vnet/tcp/builtin_proxy.h         100
-rw-r--r--  src/vnet/tcp/builtin_server.c        455
-rw-r--r--  src/vnet/tcp/tcp.api                  42
-rw-r--r--  src/vnet/tcp/tcp.c                  1943
-rw-r--r--  src/vnet/tcp/tcp.h                   985
-rw-r--r--  src/vnet/tcp/tcp_api.c               119
-rwxr-xr-x  src/vnet/tcp/tcp_debug.h             761
-rw-r--r--  src/vnet/tcp/tcp_error.def            43
-rw-r--r--  src/vnet/tcp/tcp_format.c            137
-rw-r--r--  src/vnet/tcp/tcp_input.c            3215
-rw-r--r--  src/vnet/tcp/tcp_newreno.c           107
-rw-r--r--  src/vnet/tcp/tcp_output.c           2113
-rw-r--r--  src/vnet/tcp/tcp_packet.h            184
-rw-r--r--  src/vnet/tcp/tcp_pg.c                244
-rw-r--r--  src/vnet/tcp/tcp_syn_filter4.c       545
-rw-r--r--  src/vnet/tcp/tcp_test.c             1764
-rw-r--r--  src/vnet/tcp/tcp_timer.h              29
21 files changed, 14842 insertions(+), 0 deletions(-)
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
new file mode 100644
index 00000000..527b3289
--- /dev/null
+++ b/src/vnet/tcp/builtin_client.c
@@ -0,0 +1,770 @@
+/*
+ * builtin_client.c - vpp built-in tcp client/connect code
+ *
+ * Copyright (c) 2017 by Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/plugin/plugin.h>
+#include <vnet/tcp/builtin_client.h>
+
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+#include <vpp/app/version.h>
+
+#define TCP_BUILTIN_CLIENT_DBG (0)
+
+static void
+signal_evt_to_cli_i (int *code)
+{
+  tclient_main_t *tm = &tclient_main;
+  ASSERT (vlib_get_thread_index () == 0);
+  vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, *code, 0);
+}
+
+static void
+signal_evt_to_cli (int code)
+{
+  if (vlib_get_thread_index () != 0)
+    vl_api_rpc_call_main_thread (signal_evt_to_cli_i, (u8 *) & code,
+                                 sizeof (code));
+  else
+    signal_evt_to_cli_i (&code);
+}
+
+static void
+send_test_chunk (tclient_main_t * tm, session_t * s)
+{
+  u8 *test_data = tm->connect_test_data;
+  int test_buf_offset;
+  u32 bytes_this_chunk;
+  session_fifo_event_t evt;
+  static int serial_number = 0;
+  svm_fifo_t *txf;
+  int rv;
+
+  ASSERT (vec_len (test_data) > 0);
+
+  test_buf_offset = s->bytes_sent % vec_len (test_data);
+  bytes_this_chunk = vec_len (test_data) - test_buf_offset;
+
+  bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
+    ? bytes_this_chunk : s->bytes_to_send;
+
+  txf = s->server_tx_fifo;
+  rv = svm_fifo_enqueue_nowait (txf, bytes_this_chunk,
+                                test_data + test_buf_offset);
+
+  /* If we managed to enqueue data... */
+  if (rv > 0)
+    {
+      /* Account for it... */
+      s->bytes_to_send -= rv;
+      s->bytes_sent += rv;
+
+      if (TCP_BUILTIN_CLIENT_DBG)
+        {
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "tx-enq: xfer %d bytes, sent %u remain %u",
+              .format_args = "i4i4i4",
+            };
+          /* *INDENT-ON* */
+          struct
+          {
+            u32 data[3];
+          } *ed;
+          ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+          ed->data[0] = rv;
+          ed->data[1] = s->bytes_sent;
+          ed->data[2] = s->bytes_to_send;
+        }
+
+      /* Poke the session layer */
+      if (svm_fifo_set_event (txf))
+        {
+          /* Fabricate TX event, send to vpp */
+          evt.fifo = txf;
+          evt.event_type = FIFO_EVENT_APP_TX;
+          evt.event_id = serial_number++;
+
+          if (unix_shared_memory_queue_add
+              (tm->vpp_event_queue[txf->master_thread_index], (u8 *) & evt,
+               0 /* do wait for mutex */ ))
+            clib_warning ("could not enqueue event");
+        }
+    }
+}
+
+static void
+receive_test_chunk (tclient_main_t * tm, session_t * s)
+{
+  svm_fifo_t *rx_fifo = s->server_rx_fifo;
+  int n_read, test_bytes = 0;
+  u32 my_thread_index = vlib_get_thread_index ();
+
+  /* Allow enqueuing of new event */
+  // svm_fifo_unset_event (rx_fifo);
+
+  if (test_bytes)
+    {
+      n_read = svm_fifo_dequeue_nowait (rx_fifo,
+                                        vec_len (tm->rx_buf[my_thread_index]),
+                                        tm->rx_buf[my_thread_index]);
+    }
+  else
+    {
+      n_read = svm_fifo_max_dequeue (rx_fifo);
+      svm_fifo_dequeue_drop (rx_fifo, n_read);
+    }
+
+  if (n_read > 0)
+    {
+      if (TCP_BUILTIN_CLIENT_DBG)
+        {
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "rx-deq: %d bytes",
+              .format_args = "i4",
+            };
+          /* *INDENT-ON* */
+          struct
+          {
+            u32 data[1];
+          } *ed;
+          ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+          ed->data[0] = n_read;
+        }
+
+      if (test_bytes)
+        {
+          int i;
+          for (i = 0; i < n_read; i++)
+            {
+              if (tm->rx_buf[my_thread_index][i]
+                  != ((s->bytes_received + i) & 0xff))
+                {
+                  clib_warning ("read %d error at byte %lld, 0x%x not 0x%x",
+                                n_read, s->bytes_received + i,
+                                tm->rx_buf[my_thread_index][i],
+                                ((s->bytes_received + i) & 0xff));
+                }
+            }
+        }
+      s->bytes_to_receive -= n_read;
+      s->bytes_received += n_read;
+    }
+}
+
+static uword
+builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+                        vlib_frame_t * frame)
+{
+  tclient_main_t *tm = &tclient_main;
+  int my_thread_index = vlib_get_thread_index ();
+  session_t *sp;
+  int i;
+  int delete_session;
+  u32 *connection_indices;
+  u32 *connections_this_batch;
+  u32 nconnections_this_batch;
+
+  connection_indices = tm->connection_index_by_thread[my_thread_index];
+  connections_this_batch =
+    tm->connections_this_batch_by_thread[my_thread_index];
+
+  if ((tm->run_test == 0) ||
+      ((vec_len (connection_indices) == 0)
+       && vec_len (connections_this_batch) == 0))
+    return 0;
+
+  /* Grab another pile of connections */
+  if (PREDICT_FALSE (vec_len (connections_this_batch) == 0))
+    {
+      nconnections_this_batch =
+        clib_min (tm->connections_per_batch, vec_len (connection_indices));
+
+      ASSERT (nconnections_this_batch > 0);
+      vec_validate (connections_this_batch, nconnections_this_batch - 1);
+      clib_memcpy (connections_this_batch,
+                   connection_indices + vec_len (connection_indices)
+                   - nconnections_this_batch,
+                   nconnections_this_batch * sizeof (u32));
+      _vec_len (connection_indices) -= nconnections_this_batch;
+    }
+
+  if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch
+                     && tm->prev_conns == vec_len (connections_this_batch)))
+    {
+      tm->repeats++;
+      tm->prev_conns = vec_len (connections_this_batch);
+      if (tm->repeats == 500000)
+        {
+          clib_warning ("stuck clients");
+        }
+    }
+  else
+    {
+      tm->prev_conns = vec_len (connections_this_batch);
+      tm->repeats = 0;
+    }
+
+  for (i = 0; i < vec_len (connections_this_batch); i++)
+    {
+      delete_session = 1;
+
+      sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]);
+
+      if (sp->bytes_to_send > 0)
+        {
+          send_test_chunk (tm, sp);
+          delete_session = 0;
+        }
+      if (sp->bytes_to_receive > 0)
+        {
+          receive_test_chunk (tm, sp);
+          delete_session = 0;
+        }
+      if (PREDICT_FALSE (delete_session == 1))
+        {
+          u32 index, thread_index;
+          stream_session_t *s;
+
+          __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent);
+          __sync_fetch_and_add (&tm->rx_total, sp->bytes_received);
+
+          stream_session_parse_handle (sp->vpp_session_handle,
+                                       &index, &thread_index);
+          s = stream_session_get_if_valid (index, thread_index);
+
+          if (s)
+            {
+              vnet_disconnect_args_t _a, *a = &_a;
+              a->handle = stream_session_handle (s);
+              a->app_index = tm->app_index;
+              vnet_disconnect_session (a);
+
+              vec_delete (connections_this_batch, 1, i);
+              i--;
+              __sync_fetch_and_add (&tm->ready_connections, -1);
+            }
+          else
+            clib_warning ("session AWOL?");
+
+          /* Kick the debug CLI process */
+          if (tm->ready_connections == 0)
+            {
+              signal_evt_to_cli (2);
+            }
+        }
+    }
+
+  tm->connection_index_by_thread[my_thread_index] = connection_indices;
+  tm->connections_this_batch_by_thread[my_thread_index] =
+    connections_this_batch;
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (builtin_client_node) =
+{
+  .function = builtin_client_node_fn,
+  .name = "builtin-tcp-client",
+  .type = VLIB_NODE_TYPE_INPUT,
+  .state = VLIB_NODE_STATE_DISABLED,
+};
+/* *INDENT-ON* */
+
+static int
+create_api_loopback (tclient_main_t * tm)
+{
+  api_main_t *am = &api_main;
+  vl_shmem_hdr_t *shmem_hdr;
+
+  shmem_hdr = am->shmem_hdr;
+  tm->vl_input_queue = shmem_hdr->vl_input_queue;
+  tm->my_client_index =
+    vl_api_memclnt_create_internal ("tcp_test_client", tm->vl_input_queue);
+  return 0;
+}
+
+static int
+tcp_test_clients_init (vlib_main_t * vm)
+{
+  tclient_main_t *tm = &tclient_main;
+  vlib_thread_main_t *vtm = vlib_get_thread_main ();
+  u32 num_threads;
+  int i;
+
+  if (create_api_loopback (tm))
+    return -1;
+
+  num_threads = 1 /* main thread */ + vtm->n_threads;
+
+  /* Init test data. Big buffer */
+  vec_validate (tm->connect_test_data, 1024 * 1024 - 1);
+  for (i = 0; i < vec_len (tm->connect_test_data); i++)
+    tm->connect_test_data[i] = i & 0xff;
+
+  vec_validate (tm->rx_buf, num_threads - 1);
+  for (i = 0; i < num_threads; i++)
+    vec_validate (tm->rx_buf[i], vec_len (tm->connect_test_data) - 1);
+
+  tm->is_init = 1;
+
+  vec_validate (tm->connection_index_by_thread, vtm->n_vlib_mains);
+  vec_validate (tm->connections_this_batch_by_thread, vtm->n_vlib_mains);
+  vec_validate (tm->vpp_event_queue, vtm->n_vlib_mains);
+
+  return 0;
+}
+
+static int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+                                    stream_session_t * s, u8 is_fail)
+{
+  tclient_main_t *tm = &tclient_main;
+  session_t *session;
+  u32 session_index;
+  u8 thread_index = vlib_get_thread_index ();
+
+  if (is_fail)
+    {
+      clib_warning ("connection %d failed!", api_context);
+      signal_evt_to_cli (-1);
+      return 0;
+    }
+
+  ASSERT (s->thread_index == thread_index);
+
+  if (!tm->vpp_event_queue[thread_index])
+    tm->vpp_event_queue[thread_index] =
+      session_manager_get_vpp_event_queue (thread_index);
+
+  /*
+   * Setup session
+   */
+  clib_spinlock_lock_if_init (&tm->sessions_lock);
+  pool_get (tm->sessions, session);
+  clib_spinlock_unlock_if_init (&tm->sessions_lock);
+
+  memset (session, 0, sizeof (*session));
+  session_index = session - tm->sessions;
+  session->bytes_to_send = tm->bytes_to_send;
+  session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send;
+  session->server_rx_fifo = s->server_rx_fifo;
+  session->server_rx_fifo->client_session_index = session_index;
+  session->server_tx_fifo = s->server_tx_fifo;
+  session->server_tx_fifo->client_session_index = session_index;
+  session->vpp_session_handle = stream_session_handle (s);
+
+  vec_add1 (tm->connection_index_by_thread[thread_index], session_index);
+  __sync_fetch_and_add (&tm->ready_connections, 1);
+  if (tm->ready_connections == tm->expected_connections)
+    {
+      tm->run_test = 1;
+      /* Signal the CLI process that the action is starting... */
+      signal_evt_to_cli (1);
+    }
+
+  return 0;
+}
+
+static void
+builtin_session_reset_callback (stream_session_t * s)
+{
+  if (s->session_state == SESSION_STATE_READY)
+    clib_warning ("Reset active connection %U", format_stream_session, s, 2);
+  stream_session_cleanup (s);
+  return;
+}
+
+static int
+builtin_session_create_callback (stream_session_t * s)
+{
+  return 0;
+}
+
+static void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+  tclient_main_t *tm = &tclient_main;
+  vnet_disconnect_args_t _a, *a = &_a;
+  a->handle = stream_session_handle (s);
+  a->app_index = tm->app_index;
+  vnet_disconnect_session (a);
+  return;
+}
+
+static int
+builtin_server_rx_callback (stream_session_t * s)
+{
+  return 0;
+}
+
+/* *INDENT-OFF* */
+static session_cb_vft_t builtin_clients = {
+  .session_reset_callback = builtin_session_reset_callback,
+  .session_connected_callback = builtin_session_connected_callback,
+  .session_accept_callback = builtin_session_create_callback,
+  .session_disconnect_callback = builtin_session_disconnect_callback,
+  .builtin_server_rx_callback = builtin_server_rx_callback
+};
+/* *INDENT-ON* */
+
+static int
+attach_builtin_test_clients_app (void)
+{
+  tclient_main_t *tm = &tclient_main;
+  vnet_app_attach_args_t _a, *a = &_a;
+  u8 segment_name[128];
+  u32 segment_name_length, prealloc_fifos;
+  u64 options[16];
+
+  segment_name_length = ARRAY_LEN (segment_name);
+
+  memset (a, 0, sizeof (*a));
+  memset (options, 0, sizeof (options));
+
+  a->api_client_index = tm->my_client_index;
+  a->segment_name = segment_name;
+  a->segment_name_length = segment_name_length;
+  a->session_cb_vft = &builtin_clients;
+
+  prealloc_fifos = tm->prealloc_fifos ? tm->expected_connections : 1;
+
+  options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
+  options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32);
+  options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size;
+  options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size;
+  options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count;
+  options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size;
+  options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos;
+
+  options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
+  a->options = options;
+
+  if (vnet_application_attach (a))
+    return -1;
+
+  tm->app_index = a->app_index;
+  return 0;
+}
+
+static void *
+tclient_thread_fn (void *arg)
+{
+  return 0;
+}
+
+/** Start a transmit thread */
+int
+start_tx_pthread (tclient_main_t * tm)
+{
+  if (tm->client_thread_handle == 0)
+    {
+      int rv = pthread_create (&tm->client_thread_handle,
+                               NULL /*attr */ ,
+                               tclient_thread_fn, 0);
+      if (rv)
+        {
+          tm->client_thread_handle = 0;
+          return -1;
+        }
+    }
+  return 0;
+}
+
+void
+clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients)
+{
+  tclient_main_t *tm = &tclient_main;
+  vnet_connect_args_t _a, *a = &_a;
+  int i;
+  for (i = 0; i < n_clients; i++)
+    {
+      memset (a, 0, sizeof (*a));
+
+      a->uri = (char *) uri;
+      a->api_context = i;
+      a->app_index = tm->app_index;
+      a->mp = 0;
+      vnet_connect_uri (a);
+
+      /* Crude pacing for call setups */
+      if ((i % 4) == 0)
+        vlib_process_suspend (vm, 10e-6);
+      ASSERT (i + 1 >= tm->ready_connections);
+      while (i + 1 - tm->ready_connections > 1000)
+        {
+          vlib_process_suspend (vm, 100e-6);
+        }
+    }
+}
+
+static clib_error_t *
+test_tcp_clients_command_fn (vlib_main_t * vm,
+                             unformat_input_t * input,
+                             vlib_cli_command_t * cmd)
+{
+  tclient_main_t *tm = &tclient_main;
+  vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+  uword *event_data = 0, event_type;
+  u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri;
+  u64 tmp, total_bytes;
+  f64 test_timeout = 20.0, syn_timeout = 20.0, delta;
+  f64 time_before_connects;
+  u32 n_clients = 1;
+  int preallocate_sessions = 0;
+  char *transfer_type;
+  int i;
+
+  tm->bytes_to_send = 8192;
+  tm->no_return = 0;
+  tm->fifo_size = 64 << 10;
+  tm->connections_per_batch = 1000;
+  tm->private_segment_count = 0;
+  tm->private_segment_size = 0;
+  tm->vlib_main = vm;
+  if (thread_main->n_vlib_mains > 1)
+    clib_spinlock_init (&tm->sessions_lock);
+  vec_free (tm->connect_uri);
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "nclients %d", &n_clients))
+        ;
+      else if (unformat (input, "mbytes %lld", &tmp))
+        tm->bytes_to_send = tmp << 20;
+      else if (unformat (input, "gbytes %lld", &tmp))
+        tm->bytes_to_send = tmp << 30;
+      else if (unformat (input, "bytes %lld", &tm->bytes_to_send))
+        ;
+      else if (unformat (input, "uri %s", &tm->connect_uri))
+        ;
+      else if (unformat (input, "test-timeout %f", &test_timeout))
+        ;
+      else if (unformat (input, "syn-timeout %f", &syn_timeout))
+        ;
+      else if (unformat (input, "no-return"))
+        tm->no_return = 1;
+      else if (unformat (input, "fifo-size %d", &tm->fifo_size))
+        tm->fifo_size <<= 10;
+      else if (unformat (input, "private-segment-count %d",
+                         &tm->private_segment_count))
+        ;
+      else if (unformat (input, "private-segment-size %U",
+                         unformat_memory_size, &tmp))
+        {
+          if (tmp >= 0x100000000ULL)
+            return clib_error_return
+              (0, "private segment size %lld (%llu) too large", tmp, tmp);
+          tm->private_segment_size = tmp;
+        }
+      else if (unformat (input, "preallocate-fifos"))
+        tm->prealloc_fifos = 1;
+      else if (unformat (input, "preallocate-sessions"))
+        preallocate_sessions = 1;
+      else
+        if (unformat (input, "client-batch %d", &tm->connections_per_batch))
+        ;
+      else
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+    }
+
+  /* Store cli process node index for signalling */
+  tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index;
+
+  if (tm->is_init == 0)
+    {
+      if (tcp_test_clients_init (vm))
+        return clib_error_return (0, "failed init");
+    }
+
+
+  tm->ready_connections = 0;
+  tm->expected_connections = n_clients;
+  tm->rx_total = 0;
+  tm->tx_total = 0;
+
+  uri = default_connect_uri;
+  if (tm->connect_uri)
+    uri = tm->connect_uri;
+
+#if TCP_BUILTIN_CLIENT_PTHREAD
+  start_tx_pthread ();
+#endif
+
+  vlib_worker_thread_barrier_sync (vm);
+  vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+  vlib_worker_thread_barrier_release (vm);
+
+  if (tm->test_client_attached == 0)
+    {
+      if (attach_builtin_test_clients_app ())
+        {
+          return clib_error_return (0, "app attach failed");
+        }
+    }
+  tm->test_client_attached = 1;
+
+  /* Turn on the builtin client input nodes */
+  for (i = 0; i < thread_main->n_vlib_mains; i++)
+    vlib_node_set_state (vlib_mains[i], builtin_client_node.index,
+                         VLIB_NODE_STATE_POLLING);
+
+  if (preallocate_sessions)
+    {
+      session_t *sp __attribute__ ((unused));
+      for (i = 0; i < n_clients; i++)
+        pool_get (tm->sessions, sp);
+      for (i = 0; i < n_clients; i++)
+        pool_put_index (tm->sessions, i);
+    }
+
+  /* Fire off connect requests */
+  time_before_connects = vlib_time_now (vm);
+  clients_connect (vm, uri, n_clients);
+
+  /* Park until the sessions come up, or ten seconds elapse... */
+  vlib_process_wait_for_event_or_clock (vm, syn_timeout);
+  event_type = vlib_process_get_events (vm, &event_data);
+  switch (event_type)
+    {
+    case ~0:
+      vlib_cli_output (vm, "Timeout with only %d sessions active...",
+                       tm->ready_connections);
+      goto cleanup;
+
+    case 1:
+      delta = vlib_time_now (vm) - time_before_connects;
+
+      if (delta != 0.0)
+        {
+          vlib_cli_output
+            (vm, "%d three-way handshakes in %.2f seconds, %.2f/sec",
+             n_clients, delta, ((f64) n_clients) / delta);
+        }
+
+      tm->test_start_time = vlib_time_now (tm->vlib_main);
+      vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time);
+      break;
+
+    default:
+      vlib_cli_output (vm, "unexpected event(1): %d", event_type);
+      goto cleanup;
+    }
+
+  /* Now wait for the sessions to finish... */
+  vlib_process_wait_for_event_or_clock (vm, test_timeout);
+  event_type = vlib_process_get_events (vm, &event_data);
+  switch (event_type)
+    {
+    case ~0:
+      vlib_cli_output (vm, "Timeout with %d sessions still active...",
+                       tm->ready_connections);
+      goto cleanup;
+
+    case 2:
+      tm->test_end_time = vlib_time_now (vm);
+      vlib_cli_output (vm, "Test finished at %.6f", tm->test_end_time);
+      break;
+
+    default:
+      vlib_cli_output (vm, "unexpected event(2): %d", event_type);
+      goto cleanup;
+    }
+
+  delta = tm->test_end_time - tm->test_start_time;
+
+  if (delta != 0.0)
+    {
+      total_bytes = (tm->no_return ? tm->tx_total : tm->rx_total);
+      transfer_type = tm->no_return ? "half-duplex" : "full-duplex";
+      vlib_cli_output (vm,
+                       "%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds",
+                       total_bytes, total_bytes / (1ULL << 20),
+                       total_bytes / (1ULL << 30), delta);
+      vlib_cli_output (vm, "%.2f bytes/second %s",
+                       ((f64) total_bytes) / (delta), transfer_type);
+      vlib_cli_output (vm, "%.4f gbit/second %s",
+                       (((f64) total_bytes * 8.0) / delta / 1e9),
+                       transfer_type);
+    }
+  else
+    vlib_cli_output (vm, "zero delta-t?");
+
+cleanup:
+  tm->run_test = 0;
+  for (i = 0; i < vec_len (tm->connection_index_by_thread); i++)
+    {
+      vec_reset_length (tm->connection_index_by_thread[i]);
+      vec_reset_length (tm->connections_this_batch_by_thread[i]);
+    }
+
+  pool_free (tm->sessions);
+
+  /* Detach the application, so we can use different fifo sizes next time */
+  if (tm->test_client_attached)
+    {
+      vnet_app_detach_args_t _da, *da = &_da;
+      int rv;
+
+      da->app_index = tm->app_index;
+
+      rv = vnet_application_detach (da);
+      if (rv)
+        vlib_cli_output (vm, "WARNING: app detach failed...");
+      tm->test_client_attached = 0;
+      tm->app_index = ~0;
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_clients_command, static) =
+{
+  .path = "test tcp clients",
+  .short_help = "test tcp clients [nclients %d] [[m|g]bytes <bytes>] "
+      "[test-timeout <time>][syn-timeout <time>][no-return][fifo-size <size>]"
+      "[private-segment-count <count>][private-segment-size <bytes>[m|g]]"
+      "[preallocate-fifos][preallocate-sessions][client-batch <batch-size>]"
+      "[uri <tcp://ip/port>]",
+  .function = test_tcp_clients_command_fn,
+  .is_mp_safe = 1,
+};
/* *INDENT-ON* */
+
+clib_error_t *
+tcp_test_clients_main_init (vlib_main_t * vm)
+{
+  tclient_main_t *tm = &tclient_main;
+  tm->is_init = 0;
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (tcp_test_clients_main_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
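For reference, send_test_chunk() above treats the pre-computed test data as a ring: the offset into the pattern is bytes_sent modulo the pattern length, and a partial enqueue simply advances the counters so the next call resumes where the fifo left off. Below is a minimal standalone sketch of that chunking logic; fake_fifo_enqueue() is a hypothetical stand-in for svm_fifo_enqueue_nowait(), and none of this is part of the patch itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for svm_fifo_enqueue_nowait(): accept up to
   'len' bytes, constrained by the space available, and report how
   many were actually taken. */
static int
fake_fifo_enqueue (uint8_t * dst, size_t dst_space, const uint8_t * src,
                   size_t len)
{
  size_t n = len < dst_space ? len : dst_space;
  memcpy (dst, src, n);
  return (int) n;
}

int
main (void)
{
  uint8_t pattern[256], fifo[64];
  uint64_t bytes_to_send = 1000, bytes_sent = 0;
  size_t i;

  for (i = 0; i < sizeof (pattern); i++)
    pattern[i] = i & 0xff;

  while (bytes_to_send > 0)
    {
      /* Offset into the pattern wraps with bytes_sent, exactly as in
         send_test_chunk(): the test data acts as a ring buffer. */
      uint64_t offset = bytes_sent % sizeof (pattern);
      uint64_t chunk = sizeof (pattern) - offset;
      int rv;

      if (chunk > bytes_to_send)
        chunk = bytes_to_send;

      rv = fake_fifo_enqueue (fifo, sizeof (fifo), pattern + offset, chunk);
      if (rv > 0)
        {
          /* Account for whatever the fifo accepted; a short write just
             means the next call starts deeper into the pattern. */
          bytes_sent += rv;
          bytes_to_send -= rv;
        }
    }
  printf ("sent %llu bytes\n", (unsigned long long) bytes_sent);
  return 0;
}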
diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h
new file mode 100644
index 00000000..06d239ef
--- /dev/null
+++ b/src/vnet/tcp/builtin_client.h
@@ -0,0 +1,121 @@
+
+/*
+ * tclient.h - skeleton vpp engine plug-in header file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_tclient_h__
+#define __included_tclient_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+#include <svm/svm_fifo_segment.h>
+#include <vnet/session/session.h>
+#include <vnet/session/application_interface.h>
+
+typedef struct
+{
+  u64 bytes_to_send;
+  u64 bytes_sent;
+  u64 bytes_to_receive;
+  u64 bytes_received;
+
+  svm_fifo_t *server_rx_fifo;
+  svm_fifo_t *server_tx_fifo;
+
+  u64 vpp_session_handle;
+} session_t;
+
+typedef struct
+{
+  /*
+   * Application setup parameters
+   */
+  unix_shared_memory_queue_t *vl_input_queue;   /**< vpe input queue */
+  unix_shared_memory_queue_t **vpp_event_queue;
+
+  u32 cli_node_index;                   /**< cli process node index */
+  u32 my_client_index;                  /**< loopback API client handle */
+  u32 app_index;                        /**< app index after attach */
+
+  /*
+   * Configuration params
+   */
+  u8 *connect_uri;                      /**< URI for slave's connect */
+  u64 bytes_to_send;                    /**< Bytes to send */
+  u32 configured_segment_size;
+  u32 fifo_size;
+  u32 expected_connections;             /**< Number of clients/connections */
+  u32 connections_per_batch;            /**< Connections to rx/tx at once */
+  u32 private_segment_count;            /**< Number of private fifo segs */
+  u32 private_segment_size;             /**< size of private fifo segs */
+
+  /*
+   * Test state variables
+   */
+  session_t *sessions;                  /**< Session pool, shared */
+  clib_spinlock_t sessions_lock;
+  u8 **rx_buf;                          /**< intermediate rx buffers */
+  u8 *connect_test_data;                /**< Pre-computed test data */
+  u32 **connection_index_by_thread;
+  u32 **connections_this_batch_by_thread;   /**< active connection batch */
+  pthread_t client_thread_handle;
+
+  volatile u32 ready_connections;
+  volatile u32 finished_connections;
+  volatile u64 rx_total;
+  volatile u64 tx_total;
+  volatile int run_test;                /**< Signal start of test */
+
+  f64 test_start_time;
+  f64 test_end_time;
+  u32 prev_conns;
+  u32 repeats;
+  /*
+   * Flags
+   */
+  u8 is_init;
+  u8 test_client_attached;
+  u8 no_return;
+  u8 test_return_packets;
+  int i_am_master;
+  int drop_packets;                     /**< drop all packets */
+  u8 prealloc_fifos;                    /**< Request fifo preallocation */
+
+  /*
+   * Convenience
+   */
+  vlib_main_t *vlib_main;
+  vnet_main_t *vnet_main;
+  ethernet_main_t *ethernet_main;
+} tclient_main_t;
+
+tclient_main_t tclient_main;
+
+vlib_node_registration_t tclient_node;
+
+#endif /* __included_tclient_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/builtin_http_server.c b/src/vnet/tcp/builtin_http_server.c
new file mode 100644
index 00000000..9ba19ce9
--- /dev/null
+++ b/src/vnet/tcp/builtin_http_server.c
@@ -0,0 +1,564 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+
+typedef enum
+{
+  EVENT_WAKEUP = 1,
+} http_process_event_t;
+
+typedef struct
+{
+  u64 session_handle;
+  u64 node_index;
+  u8 *data;
+} builtin_http_server_args;
+
+typedef struct
+{
+  u8 *rx_buf;
+  unix_shared_memory_queue_t **vpp_queue;
+  u64 byte_index;
+
+  uword *handler_by_get_request;
+
+  u32 *free_http_cli_process_node_indices;
+
+  /* Sever's event queue */
+  unix_shared_memory_queue_t *vl_input_queue;
+
+  /* API client handle */
+  u32 my_client_index;
+
+  u32 app_index;
+
+  /* process node index for evnt scheduling */
+  u32 node_index;
+  vlib_main_t *vlib_main;
+} http_server_main_t;
+
+http_server_main_t http_server_main;
+
+static void
+free_http_process (builtin_http_server_args * args)
+{
+  vlib_node_runtime_t *rt;
+  vlib_main_t *vm = &vlib_global_main;
+  http_server_main_t *hsm = &http_server_main;
+  vlib_node_t *n;
+  u32 node_index;
+  builtin_http_server_args **save_args;
+
+  node_index = args->node_index;
+  ASSERT (node_index != 0);
+
+  n = vlib_get_node (vm, node_index);
+  rt = vlib_node_get_runtime (vm, n->index);
+  save_args = vlib_node_get_runtime_data (vm, n->index);
+
+  /* Reset process session pointer */
+  clib_mem_free (*save_args);
+  *save_args = 0;
+
+  /* Turn off the process node */
+  vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED);
+
+  /* add node index to the freelist */
+  vec_add1 (hsm->free_http_cli_process_node_indices, node_index);
+}
+
+static const char
+  *http_response = "HTTP/1.1 200 OK\r\n"
+  "Content-Type: text/html\r\n"
+  "Expires: Mon, 11 Jan 1970 10:10:10 GMT\r\n"
+  "Connection: close\r\n"
+  "Pragma: no-cache\r\n" "Content-Length: %d\r\n\r\n%s";
+
+static const char
+  *http_error_template = "HTTP/1.1 %s\r\n"
+  "Content-Type: text/html\r\n"
+  "Expires: Mon, 11 Jan 1970 10:10:10 GMT\r\n"
+  "Connection: close\r\n" "Pragma: no-cache\r\n" "Content-Length: 0\r\n\r\n";
+
+/* Header, including incantation to suppress favicon.ico requests */
+static const char
+  *html_header_template = "<html><head><title>%v</title>"
+  "</head><link rel=\"icon\" href=\"data:,\"><body><pre>";
+
+static const char *html_footer = "</pre></body></html>\r\n";
+
+static void
+http_cli_output (uword arg, u8 * buffer, uword buffer_bytes)
+{
+  u8 **output_vecp = (u8 **) arg;
+  u8 *output_vec;
+  u32 offset;
+
+  output_vec = *output_vecp;
+
+  offset = vec_len (output_vec);
+  vec_validate (output_vec, offset + buffer_bytes - 1);
+  clib_memcpy (output_vec + offset, buffer, buffer_bytes);
+
+  *output_vecp = output_vec;
+}
+
+void
+send_data (builtin_http_server_args * args, u8 * data)
+{
+  session_fifo_event_t evt;
+  u32 offset, bytes_to_send;
+  f64 delay = 10e-3;
+  http_server_main_t *hsm = &http_server_main;
+  vlib_main_t *vm = hsm->vlib_main;
+  f64 last_sent_timer = vlib_time_now (vm);
+  stream_session_t *s;
+
+  s = stream_session_get_from_handle (args->session_handle);
+  ASSERT (s);
+  bytes_to_send = vec_len (data);
+  offset = 0;
+
+  while (bytes_to_send > 0)
+    {
+      int actual_transfer;
+
+      actual_transfer = svm_fifo_enqueue_nowait
+        (s->server_tx_fifo, bytes_to_send, data + offset);
+
+      /* Made any progress? */
+      if (actual_transfer <= 0)
+        {
+          vlib_process_suspend (vm, delay);
+          /* 10s deadman timer */
+          if (vlib_time_now (vm) > last_sent_timer + 10.0)
+            {
+              /* $$$$ FC: reset transport session here? */
+              break;
+            }
+          /* Exponential backoff, within reason */
+          if (delay < 1.0)
+            delay = delay * 2.0;
+        }
+      else
+        {
+          last_sent_timer = vlib_time_now (vm);
+          offset += actual_transfer;
+          bytes_to_send -= actual_transfer;
+
+          if (svm_fifo_set_event (s->server_tx_fifo))
+            {
+              /* Fabricate TX event, send to vpp */
+              evt.fifo = s->server_tx_fifo;
+              evt.event_type = FIFO_EVENT_APP_TX;
+              evt.event_id = 0;
+
+              unix_shared_memory_queue_add (hsm->vpp_queue[s->thread_index],
+                                            (u8 *) & evt,
+                                            0 /* do wait for mutex */ );
+            }
+          delay = 10e-3;
+        }
+    }
+}
+
+static void
+send_error (builtin_http_server_args * args, char *str)
+{
+  u8 *data;
+
+  data = format (0, http_error_template, str);
+  send_data (args, data);
+  vec_free (data);
+}
+
+static uword
+http_cli_process (vlib_main_t * vm,
+                  vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  http_server_main_t *hsm = &http_server_main;
+  u8 *request = 0, *reply = 0;
+  builtin_http_server_args **save_args;
+  builtin_http_server_args *args;
+  unformat_input_t input;
+  int i;
+  u8 *http = 0, *html = 0;
+
+  save_args = vlib_node_get_runtime_data (hsm->vlib_main, rt->node_index);
+  args = *save_args;
+
+  request = (u8 *) (void *) (args->data);
+  if (vec_len (request) < 7)
+    {
+      send_error (args, "400 Bad Request");
+      goto out;
+    }
+
+  for (i = 0; i < vec_len (request) - 4; i++)
+    {
+      if (request[i] == 'G' &&
+          request[i + 1] == 'E' &&
+          request[i + 2] == 'T' && request[i + 3] == ' ')
+        goto found;
+    }
+bad_request:
+  send_error (args, "400 Bad Request");
+  goto out;
+
+found:
+  /* Lose "GET " */
+  vec_delete (request, i + 5, 0);
+
+  /* Replace slashes with spaces, stop at the end of the path */
+  i = 0;
+  while (1)
+    {
+      if (request[i] == '/')
+        request[i] = ' ';
+      else if (request[i] == ' ')
+        {
+          /* vlib_cli_input is vector-based, no need for a NULL */
+          _vec_len (request) = i;
+          break;
+        }
+      i++;
+      /* Should never happen */
+      if (i == vec_len (request))
+        goto bad_request;
+    }
+
+  /* Generate the html header */
+  html = format (0, html_header_template, request /* title */ );
+
+  /* Run the command */
+  unformat_init_vector (&input, request);
+  vlib_cli_input (vm, &input, http_cli_output, (uword) & reply);
+  unformat_free (&input);
+  request = 0;
+
+  /* Generate the html page */
+  html = format (html, "%v", reply);
+  html = format (html, html_footer);
+  /* And the http reply */
+  http = format (0, http_response, vec_len (html), html);
+
+  /* Send it */
+  send_data (args, http);
+
+out:
+  /* Cleanup */
+  vec_free (request);
+  vec_free (reply);
+  vec_free (html);
+  vec_free (http);
+
+  free_http_process (args);
+  return (0);
+}
+
+static void
+alloc_http_process (builtin_http_server_args * args)
+{
+  char *name;
+  vlib_node_t *n;
+  http_server_main_t *hsm = &http_server_main;
+  vlib_main_t *vm = hsm->vlib_main;
+  uword l = vec_len (hsm->free_http_cli_process_node_indices);
+  builtin_http_server_args **save_args;
+
+  if (vec_len (hsm->free_http_cli_process_node_indices) > 0)
+    {
+      n = vlib_get_node (vm, hsm->free_http_cli_process_node_indices[l - 1]);
+      vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING);
+      _vec_len (hsm->free_http_cli_process_node_indices) = l - 1;
+    }
+  else
+    {
+      static vlib_node_registration_t r = {
+        .function = http_cli_process,
+        .type = VLIB_NODE_TYPE_PROCESS,
+        .process_log2_n_stack_bytes = 16,
+        .runtime_data_bytes = sizeof (void *),
+      };
+
+      name = (char *) format (0, "http-cli-%d", l);
+      r.name = name;
+      vlib_register_node (vm, &r);
+      vec_free (name);
+
+      n = vlib_get_node (vm, r.index);
+    }
+
+  /* Save the node index in the args. It won't be zero. */
+  args->node_index = n->index;
+
+  /* Save the args (pointer) in the node runtime */
+  save_args = vlib_node_get_runtime_data (vm, n->index);
+  *save_args = args;
+
+  vlib_start_process (vm, n->runtime_index);
+}
+
+static void
+alloc_http_process_callback (void *cb_args)
+{
+  alloc_http_process ((builtin_http_server_args *) cb_args);
+}
+
+static int
+http_server_rx_callback (stream_session_t * s)
+{
+  u32 max_dequeue;
+  int actual_transfer;
+  http_server_main_t *hsm = &http_server_main;
+  svm_fifo_t *rx_fifo;
+  builtin_http_server_args *args;
+
+  rx_fifo = s->server_rx_fifo;
+  max_dequeue = svm_fifo_max_dequeue (rx_fifo);
+  svm_fifo_unset_event (rx_fifo);
+  if (PREDICT_FALSE (max_dequeue == 0))
+    return 0;
+
+  vec_validate (hsm->rx_buf, max_dequeue - 1);
+  _vec_len (hsm->rx_buf) = max_dequeue;
+
+  actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_dequeue,
+                                             hsm->rx_buf);
+  ASSERT (actual_transfer > 0);
+  _vec_len (hsm->rx_buf) = actual_transfer;
+
+  /* send the command to a new/recycled vlib process */
+  args = clib_mem_alloc (sizeof (*args));
+  args->data = vec_dup (hsm->rx_buf);
+  args->session_handle = stream_session_handle (s);
+
+  /* Send an RPC request via the thread-0 input node */
+  if (vlib_get_thread_index () != 0)
+    {
+      session_fifo_event_t evt;
+      evt.rpc_args.fp = alloc_http_process_callback;
+      evt.rpc_args.arg = args;
+      evt.event_type = FIFO_EVENT_RPC;
+      unix_shared_memory_queue_add
+        (session_manager_get_vpp_event_queue (0 /* main thread */ ),
+         (u8 *) & evt, 0 /* do wait for mutex */ );
+    }
+  else
+    alloc_http_process (args);
+  return 0;
+}
+
+static int
+builtin_session_accept_callback (stream_session_t * s)
+{
+  http_server_main_t *bsm = &http_server_main;
+
+  bsm->vpp_queue[s->thread_index] =
+    session_manager_get_vpp_event_queue (s->thread_index);
+  s->session_state = SESSION_STATE_READY;
+  bsm->byte_index = 0;
+  return 0;
+}
+
+static void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+  http_server_main_t *bsm = &http_server_main;
+  vnet_disconnect_args_t _a, *a = &_a;
+
+  a->handle = stream_session_handle (s);
+  a->app_index = bsm->app_index;
+  vnet_disconnect_session (a);
+}
+
+static void
+builtin_session_reset_callback (stream_session_t * s)
+{
+  clib_warning ("called.. ");
+
+  stream_session_cleanup (s);
+}
+
+static int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+                                    stream_session_t * s, u8 is_fail)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static int
+builtin_add_segment_callback (u32 client_index,
+                              const u8 * seg_name, u32 seg_size)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static int
+builtin_redirect_connect_callback (u32 client_index, void *mp)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+  .session_accept_callback = builtin_session_accept_callback,
+  .session_disconnect_callback = builtin_session_disconnect_callback,
+  .session_connected_callback = builtin_session_connected_callback,
+  .add_segment_callback = builtin_add_segment_callback,
+  .redirect_connect_callback = builtin_redirect_connect_callback,
+  .builtin_server_rx_callback = http_server_rx_callback,
+  .session_reset_callback = builtin_session_reset_callback
+};
+
+/* Abuse VPP's input queue */
+static int
+create_api_loopback (vlib_main_t * vm)
+{
+  http_server_main_t *hsm = &http_server_main;
+  api_main_t *am = &api_main;
+  vl_shmem_hdr_t *shmem_hdr;
+
+  shmem_hdr = am->shmem_hdr;
+  hsm->vl_input_queue = shmem_hdr->vl_input_queue;
+  hsm->my_client_index =
+    vl_api_memclnt_create_internal ("tcp_test_client", hsm->vl_input_queue);
+  return 0;
+}
+
+static int
+server_attach ()
+{
+  http_server_main_t *hsm = &http_server_main;
+  u8 segment_name[128];
+  u64 options[SESSION_OPTIONS_N_OPTIONS];
+  vnet_app_attach_args_t _a, *a = &_a;
+
+  memset (a, 0, sizeof (*a));
+  memset (options, 0, sizeof (options));
+
+  a->api_client_index = hsm->my_client_index;
+  a->session_cb_vft = &builtin_session_cb_vft;
+  a->options = options;
+  a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
+  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 8 << 10;
+  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 32 << 10;
+  a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+  a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
+  a->segment_name = segment_name;
+  a->segment_name_length = ARRAY_LEN (segment_name);
+
+  if (vnet_application_attach (a))
+    {
+      clib_warning ("failed to attach server");
+      return -1;
+    }
+  hsm->app_index = a->app_index;
+  return 0;
+}
+
+static int
+server_listen ()
+{
+  http_server_main_t *hsm = &http_server_main;
+  vnet_bind_args_t _a, *a = &_a;
+  memset (a, 0, sizeof (*a));
+  a->app_index = hsm->app_index;
+  a->uri = "tcp://0.0.0.0/80";
+  return vnet_bind_uri (a);
+}
+
+static int
+server_create (vlib_main_t * vm)
+{
+  http_server_main_t *hsm = &http_server_main;
+  u32 num_threads;
+  vlib_thread_main_t *vtm = vlib_get_thread_main ();
+
+  ASSERT (hsm->my_client_index == (u32) ~ 0);
+  if (create_api_loopback (vm))
+    return -1;
+
+  num_threads = 1 /* main thread */ + vtm->n_threads;
+  vec_validate (http_server_main.vpp_queue, num_threads - 1);
+
+  if (server_attach ())
+    {
+      clib_warning ("failed to attach server");
+      return -1;
+    }
+  if (server_listen ())
+    {
+      clib_warning ("failed to start listening");
+      return -1;
+    }
+  return 0;
+}
+
+static clib_error_t *
+server_create_command_fn (vlib_main_t * vm,
+                          unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  http_server_main_t *hsm = &http_server_main;
+  int rv;
+
+  if (hsm->my_client_index != (u32) ~ 0)
+    return clib_error_return (0, "test http server is already running");
+
+  vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+  rv = server_create (vm);
+  switch (rv)
+    {
+    case 0:
+      break;
+    default:
+      return clib_error_return (0, "server_create returned %d", rv);
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+  .path = "test http server",
+  .short_help = "test http server",
+  .function = server_create_command_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+builtin_http_server_main_init (vlib_main_t * vm)
+{
+  http_server_main_t *hsm = &http_server_main;
+  hsm->my_client_index = ~0;
+  hsm->vlib_main = vm;
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_http_server_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/builtin_proxy.c b/src/vnet/tcp/builtin_proxy.c
new file mode 100644
index 00000000..91377e76
--- /dev/null
+++ b/src/vnet/tcp/builtin_proxy.c
@@ -0,0 +1,601 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+#include <vnet/tcp/builtin_proxy.h>
+
+builtin_proxy_main_t builtin_proxy_main;
+
+static void
+delete_proxy_session (stream_session_t * s, int is_active_open)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  proxy_session_t *ps = 0;
+  vnet_disconnect_args_t _a, *a = &_a;
+  stream_session_t *active_open_session = 0;
+  stream_session_t *server_session = 0;
+  uword *p;
+  u64 handle;
+
+  handle = stream_session_handle (s);
+
+  clib_spinlock_lock_if_init (&bpm->sessions_lock);
+  if (is_active_open)
+    {
+      active_open_session = s;
+
+      p = hash_get (bpm->proxy_session_by_active_open_handle, handle);
+      if (p == 0)
+        {
+          clib_warning ("proxy session for %s handle %lld (%llx) AWOL",
+                        is_active_open ? "active open" : "server",
+                        handle, handle);
+        }
+      else
+        {
+          ps = pool_elt_at_index (bpm->sessions, p[0]);
+          if (ps->vpp_server_handle != ~0)
+            server_session = stream_session_get_from_handle
+              (ps->vpp_server_handle);
+          else
+            server_session = 0;
+        }
+    }
+  else
+    {
+      server_session = s;
+
+      p = hash_get (bpm->proxy_session_by_server_handle, handle);
+      if (p == 0)
+        {
+          clib_warning ("proxy session for %s handle %lld (%llx) AWOL",
+                        is_active_open ? "active open" : "server",
+                        handle, handle);
+        }
+      else
+        {
+          ps = pool_elt_at_index (bpm->sessions, p[0]);
+          if (ps->vpp_server_handle != ~0)
+            active_open_session = stream_session_get_from_handle
+              (ps->vpp_server_handle);
+          else
+            active_open_session = 0;
+        }
+    }
+
+  if (ps)
+    {
+      if (CLIB_DEBUG > 0)
+        memset (ps, 0xFE, sizeof (*ps));
+      pool_put (bpm->sessions, ps);
+    }
+
+  clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+  if (active_open_session)
+    {
+      a->handle = stream_session_handle (active_open_session);
+      a->app_index = bpm->active_open_app_index;
+      hash_unset (bpm->proxy_session_by_active_open_handle,
+                  stream_session_handle (active_open_session));
+      vnet_disconnect_session (a);
+    }
+
+  if (server_session)
+    {
+      a->handle = stream_session_handle (server_session);
+      a->app_index = bpm->server_app_index;
+      hash_unset (bpm->proxy_session_by_server_handle,
+                  stream_session_handle (server_session));
+      vnet_disconnect_session (a);
+    }
+}
+
+static int
+server_accept_callback (stream_session_t * s)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+
+  s->session_state = SESSION_STATE_READY;
+
+  clib_spinlock_lock_if_init (&bpm->sessions_lock);
+
+  return 0;
+}
+
+static void
+server_disconnect_callback (stream_session_t * s)
+{
+  delete_proxy_session (s, 0 /* is_active_open */ );
+}
+
+static void
+server_reset_callback (stream_session_t * s)
+{
+  clib_warning ("Reset session %U", format_stream_session, s, 2);
+  delete_proxy_session (s, 0 /* is_active_open */ );
+}
+
+static int
+server_connected_callback (u32 app_index, u32 api_context,
+                           stream_session_t * s, u8 is_fail)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static int
+server_add_segment_callback (u32 client_index,
+                             const u8 * seg_name, u32 seg_size)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static int
+server_redirect_connect_callback (u32 client_index, void *mp)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+static int
+server_rx_callback (stream_session_t * s)
+{
+  u32 max_dequeue;
+  int actual_transfer __attribute__ ((unused));
+  svm_fifo_t *tx_fifo, *rx_fifo;
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  u32 thread_index = vlib_get_thread_index ();
+  vnet_connect_args_t _a, *a = &_a;
+  proxy_session_t *ps;
+  int proxy_index;
+  uword *p;
+  svm_fifo_t *active_open_tx_fifo;
+  session_fifo_event_t evt;
+
+  ASSERT (s->thread_index == thread_index);
+
+  clib_spinlock_lock_if_init (&bpm->sessions_lock);
+  p =
+    hash_get (bpm->proxy_session_by_server_handle, stream_session_handle (s));
+
+  if (PREDICT_TRUE (p != 0))
+    {
+      clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+      active_open_tx_fifo = s->server_rx_fifo;
+
+      /*
+       * Send event for active open tx fifo
+       */
+      if (svm_fifo_set_event (active_open_tx_fifo))
+        {
+          evt.fifo = active_open_tx_fifo;
+          evt.event_type = FIFO_EVENT_APP_TX;
+          if (unix_shared_memory_queue_add
+              (bpm->active_open_event_queue[thread_index], (u8 *) & evt,
+               0 /* do wait for mutex */ ))
+            clib_warning ("failed to enqueue tx evt");
+        }
+    }
+  else
+    {
+      rx_fifo = s->server_rx_fifo;
+      tx_fifo = s->server_tx_fifo;
+
+      ASSERT (rx_fifo->master_thread_index == thread_index);
+      ASSERT (tx_fifo->master_thread_index == thread_index);
+
+      max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
+
+      if (PREDICT_FALSE (max_dequeue == 0))
+        return 0;
+
+      actual_transfer = svm_fifo_peek (rx_fifo, 0 /* relative_offset */ ,
+                                       max_dequeue,
+                                       bpm->rx_buf[thread_index]);
+
+      /* $$$ your message in this space: parse url, etc. */
+
+      memset (a, 0, sizeof (*a));
+
+      clib_spinlock_lock_if_init (&bpm->sessions_lock);
+      pool_get (bpm->sessions, ps);
+      memset (ps, 0, sizeof (*ps));
+      ps->server_rx_fifo = rx_fifo;
+      ps->server_tx_fifo = tx_fifo;
+      ps->vpp_server_handle = stream_session_handle (s);
+
+      proxy_index = ps - bpm->sessions;
+
+      hash_set (bpm->proxy_session_by_server_handle, ps->vpp_server_handle,
+                proxy_index);
+
+      clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+      a->uri = "tcp://6.0.2.2/23";
+      a->api_context = proxy_index;
+      a->app_index = bpm->active_open_app_index;
+      a->mp = 0;
+      vnet_connect_uri (a);
+    }
+
+  return 0;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+  .session_accept_callback = server_accept_callback,
+  .session_disconnect_callback = server_disconnect_callback,
+  .session_connected_callback = server_connected_callback,
+  .add_segment_callback = server_add_segment_callback,
+  .redirect_connect_callback = server_redirect_connect_callback,
+  .builtin_server_rx_callback = server_rx_callback,
+  .session_reset_callback = server_reset_callback
+};
+
+static int
+active_open_connected_callback (u32 app_index, u32 opaque,
+                                stream_session_t * s, u8 is_fail)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  proxy_session_t *ps;
+  u8 thread_index = vlib_get_thread_index ();
+  session_fifo_event_t evt;
+
+  if (is_fail)
+    {
+      clib_warning ("connection %d failed!", opaque);
+      return 0;
+    }
+
+  /*
+   * Setup proxy session handle.
+   */
+  clib_spinlock_lock_if_init (&bpm->sessions_lock);
+
+  ps = pool_elt_at_index (bpm->sessions, opaque);
+  ps->vpp_active_open_handle = stream_session_handle (s);
+
+  s->server_tx_fifo = ps->server_rx_fifo;
+  s->server_rx_fifo = ps->server_tx_fifo;
+
+  /*
+   * Reset the active-open tx-fifo master indices so the active-open session
+   * will receive data, etc.
+   */
+  s->server_tx_fifo->master_session_index = s->session_index;
+  s->server_tx_fifo->master_thread_index = s->thread_index;
+
+  /*
+   * Account for the active-open session's use of the fifos
+   * so they won't disappear until the last session which uses
+   * them disappears
+   */
+  s->server_tx_fifo->refcnt++;
+  s->server_rx_fifo->refcnt++;
+
+  hash_set (bpm->proxy_session_by_active_open_handle,
+            ps->vpp_active_open_handle, opaque);
+
+  clib_spinlock_unlock_if_init (&bpm->sessions_lock);
+
+  /*
+   * Send event for active open tx fifo
+   */
+  if (svm_fifo_set_event (s->server_tx_fifo))
+    {
+      evt.fifo = s->server_tx_fifo;
+      evt.event_type = FIFO_EVENT_APP_TX;
+      if (unix_shared_memory_queue_add
+          (bpm->active_open_event_queue[thread_index], (u8 *) & evt,
+           0 /* do wait for mutex */ ))
+        clib_warning ("failed to enqueue tx evt");
+    }
+
+  return 0;
+}
+
+static void
+active_open_reset_callback (stream_session_t * s)
+{
+  delete_proxy_session (s, 1 /* is_active_open */ );
+}
+
+static int
+active_open_create_callback (stream_session_t * s)
+{
+  return 0;
+}
+
+static void
+active_open_disconnect_callback (stream_session_t * s)
+{
+  delete_proxy_session (s, 1 /* is_active_open */ );
+}
+
+static int
+active_open_rx_callback (stream_session_t * s)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  session_fifo_event_t evt;
+  svm_fifo_t *server_rx_fifo;
+  u32 thread_index = vlib_get_thread_index ();
+
+  server_rx_fifo = s->server_rx_fifo;
+
+  /*
+   * Send event for server tx fifo
+   */
+  if (svm_fifo_set_event (server_rx_fifo))
+    {
+      evt.fifo = server_rx_fifo;
+      evt.event_type = FIFO_EVENT_APP_TX;
+      if (unix_shared_memory_queue_add
+          (bpm->server_event_queue[thread_index], (u8 *) & evt,
+           0 /* do wait for mutex */ ))
+        clib_warning ("failed to enqueue server rx evt");
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+static session_cb_vft_t builtin_clients = {
+  .session_reset_callback = active_open_reset_callback,
+  .session_connected_callback = active_open_connected_callback,
+  .session_accept_callback = active_open_create_callback,
+  .session_disconnect_callback = active_open_disconnect_callback,
+  .builtin_server_rx_callback = active_open_rx_callback
+};
/* *INDENT-ON* */
+
+
+static void
+create_api_loopbacks (vlib_main_t * vm)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  api_main_t *am = &api_main;
+  vl_shmem_hdr_t *shmem_hdr;
+
+  shmem_hdr = am->shmem_hdr;
+  bpm->vl_input_queue = shmem_hdr->vl_input_queue;
+  bpm->server_client_index =
+    vl_api_memclnt_create_internal ("proxy_server", bpm->vl_input_queue);
+  bpm->active_open_client_index =
+    vl_api_memclnt_create_internal ("proxy_active_open", bpm->vl_input_queue);
+}
+
+static int
+server_attach ()
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  u8 segment_name[128];
+  u64 options[SESSION_OPTIONS_N_OPTIONS];
+  vnet_app_attach_args_t _a, *a = &_a;
+
+  memset (a, 0, sizeof (*a));
+  memset (options, 0, sizeof (options));
+
+  a->api_client_index = bpm->server_client_index;
+  a->session_cb_vft = &builtin_session_cb_vft;
+  a->options = options;
+  a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bpm->fifo_size;
+  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bpm->fifo_size;
+  a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bpm->private_segment_count;
+  a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bpm->private_segment_size;
+  a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+    bpm->prealloc_fifos ? bpm->prealloc_fifos : 1;
+
+  a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP;
+
+  a->segment_name = segment_name;
+  a->segment_name_length = ARRAY_LEN (segment_name);
+
+  if (vnet_application_attach (a))
+    {
+      clib_warning ("failed to attach server");
+      return -1;
+    }
+  bpm->server_app_index = a->app_index;
+
+  return 0;
+}
+
+static int
+active_open_attach (void)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  vnet_app_attach_args_t _a, *a = &_a;
+  u8 segment_name[128];
+  u32 segment_name_length;
+  u64 options[16];
+
+  segment_name_length = ARRAY_LEN (segment_name);
+
+  memset (a, 0, sizeof (*a));
+  memset (options, 0, sizeof (options));
+
+  a->api_client_index = bpm->active_open_client_index;
+  a->segment_name = segment_name;
+  a->segment_name_length = segment_name_length;
+  a->session_cb_vft = &builtin_clients;
+
+  options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678;
+  options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+  options[SESSION_OPTIONS_RX_FIFO_SIZE] = bpm->fifo_size;
+  options[SESSION_OPTIONS_TX_FIFO_SIZE] = bpm->fifo_size;
+  options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bpm->private_segment_count;
+  options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bpm->private_segment_size;
+  options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] =
+    bpm->prealloc_fifos ? bpm->prealloc_fifos : 1;
+
+  options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP
+    | APP_OPTIONS_FLAGS_IS_PROXY;
+
+  a->options = options;
+
+  if (vnet_application_attach (a))
+    return -1;
+
+  bpm->active_open_app_index = a->app_index;
+
+  return 0;
+}
+
+static int
+server_listen ()
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  vnet_bind_args_t _a, *a = &_a;
+  memset (a, 0, sizeof (*a));
+  a->app_index = bpm->server_app_index;
+  a->uri = "tcp://0.0.0.0/23";
+  return vnet_bind_uri (a);
+}
+
+static int
+server_create (vlib_main_t * vm)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  vlib_thread_main_t *vtm = vlib_get_thread_main ();
+  u32 num_threads;
+  int i;
+
+  if (bpm->server_client_index == (u32) ~ 0)
+    create_api_loopbacks (vm);
+
+  num_threads = 1 /* main thread */ + vtm->n_threads;
+  vec_validate (builtin_proxy_main.server_event_queue, num_threads - 1);
+  vec_validate (builtin_proxy_main.active_open_event_queue, num_threads - 1);
+  vec_validate (bpm->rx_buf, num_threads - 1);
+
+  for (i = 0; i < num_threads; i++)
+    vec_validate (bpm->rx_buf[i], bpm->rcv_buffer_size);
+
+  if (server_attach ())
+    {
+      clib_warning ("failed to attach server app");
+      return -1;
+    }
+  if (server_listen ())
+    {
+      clib_warning ("failed to start listening");
+      return -1;
+    }
+  if (active_open_attach ())
+    {
+      clib_warning ("failed to attach active open app");
+      return -1;
+    }
+
+  for (i = 0; i < num_threads; i++)
+    {
+      bpm->active_open_event_queue[i] =
+        session_manager_get_vpp_event_queue (i);
+
+      ASSERT (bpm->active_open_event_queue[i]);
+
+      bpm->server_event_queue[i] = session_manager_get_vpp_event_queue (i);
+    }
+
+  return 0;
+}
+
+static clib_error_t *
+proxy_server_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                                vlib_cli_command_t * cmd)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  int rv;
+  u64 tmp;
+
+  bpm->fifo_size = 64 << 10;
+  bpm->rcv_buffer_size = 1024;
+  bpm->prealloc_fifos = 0;
+  bpm->private_segment_count = 0;
+  bpm->private_segment_size = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "fifo-size %d", &bpm->fifo_size))
+        bpm->fifo_size <<= 10;
+      else if (unformat (input, "rcv-buf-size %d", &bpm->rcv_buffer_size))
+        ;
+      else if (unformat (input, "prealloc-fifos %d", &bpm->prealloc_fifos))
+        ;
+      else if (unformat (input, "private-segment-count %d",
+                         &bpm->private_segment_count))
+        ;
+      else if (unformat (input, "private-segment-size %U",
+                         unformat_memory_size, &tmp))
+        {
+          if (tmp >= 0x100000000ULL)
+            return clib_error_return
+              (0, "private segment size %lld (%llu) too large", tmp, tmp);
+          bpm->private_segment_size = tmp;
+        }
+      else
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+    }
+
+  vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ );
+
+  rv = server_create (vm);
+  switch (rv)
+    {
+    case 0:
+      break;
+    default:
+      return clib_error_return (0, "server_create returned %d", rv);
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+  .path = "test proxy server",
+  .short_help = "test proxy server",
+  .function = proxy_server_create_command_fn,
+};
/* *INDENT-ON* */
+
+clib_error_t *
+builtin_tcp_proxy_main_init (vlib_main_t * vm)
+{
+  builtin_proxy_main_t *bpm = &builtin_proxy_main;
+  bpm->server_client_index = ~0;
+  bpm->active_open_client_index = ~0;
+  bpm->proxy_session_by_active_open_handle = hash_create (0, sizeof (uword));
+  bpm->proxy_session_by_server_handle = hash_create (0, sizeof (uword));
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_tcp_proxy_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
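In active_open_connected_callback() above, the proxy cross-connects the two session legs by swapping fifo pointers (the active-open leg's tx fifo is the server leg's rx fifo and vice versa) and bumping each fifo's refcnt so the shared memory survives until the last leg disconnects. A toy sketch of that ownership scheme follows; toy_fifo_t is a hypothetical stand-in for svm_fifo_t, not part of the patch.

#include <stdio.h>
#include <stdlib.h>

/* Toy fifo carrying only a reference count, echoing the refcnt++ in
   active_open_connected_callback(). */
typedef struct { int refcnt; } toy_fifo_t;

static toy_fifo_t *
fifo_alloc (void)
{
  toy_fifo_t *f = calloc (1, sizeof (*f));
  f->refcnt = 1;                /* first owner: the server leg */
  return f;
}

static void
fifo_unref (toy_fifo_t * f)
{
  if (--f->refcnt == 0)
    {
      free (f);
      puts ("fifo freed");
    }
}

int
main (void)
{
  /* Server leg allocates the rx/tx fifo pair. */
  toy_fifo_t *rx = fifo_alloc (), *tx = fifo_alloc ();

  /* Active-open leg attaches to the same fifos, swapped, so data is
     forwarded without copying; each fifo gains a second owner. */
  rx->refcnt++;
  tx->refcnt++;

  /* The legs disconnect independently; the fifos outlive the first
     disconnect and are freed only on the second. */
  fifo_unref (rx); fifo_unref (tx);   /* server leg goes away */
  fifo_unref (rx); fifo_unref (tx);   /* active-open leg: memory freed */
  return 0;
}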
if (unformat (input, "prealloc-fifos %d", &bpm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bpm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000ULL) + return clib_error_return + (0, "private segment size %lld (%llu) too large", tmp, tmp); + bpm->private_segment_size = tmp; + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); + + rv = server_create (vm); + switch (rv) + { + case 0: + break; + default: + return clib_error_return (0, "server_create returned %d", rv); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (server_create_command, static) = +{ + .path = "test proxy server", + .short_help = "test proxy server", + .function = proxy_server_create_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +builtin_tcp_proxy_main_init (vlib_main_t * vm) +{ + builtin_proxy_main_t *bpm = &builtin_proxy_main; + bpm->server_client_index = ~0; + bpm->active_open_client_index = ~0; + bpm->proxy_session_by_active_open_handle = hash_create (0, sizeof (uword)); + bpm->proxy_session_by_server_handle = hash_create (0, sizeof (uword)); + + return 0; +} + +VLIB_INIT_FUNCTION (builtin_tcp_proxy_main_init); + +/* +* fd.io coding-style-patch-verification: ON +* +* Local Variables: +* eval: (c-set-style "gnu") +* End: +*/ diff --git a/src/vnet/tcp/builtin_proxy.h b/src/vnet/tcp/builtin_proxy.h new file mode 100644 index 00000000..cf707a15 --- /dev/null +++ b/src/vnet/tcp/builtin_proxy.h @@ -0,0 +1,100 @@ + +/* + * builtin_proxy.h - skeleton vpp engine plug-in header file + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_builtin_proxy_h__ +#define __included_builtin_proxy_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vlibmemory/unix_shared_memory_queue.h> +#include <svm/svm_fifo_segment.h> +#include <vnet/session/session.h> +#include <vnet/session/application_interface.h> + +typedef struct +{ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + u64 vpp_server_handle; + u64 vpp_active_open_handle; +} proxy_session_t; + +typedef struct +{ + unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */ + /** per-thread vectors */ + unix_shared_memory_queue_t **server_event_queue; + unix_shared_memory_queue_t **active_open_event_queue; + u8 **rx_buf; /**< intermediate rx buffers */ + + u32 cli_node_index; /**< cli process node index */ + u32 server_client_index; /**< server API client handle */ + u32 server_app_index; /**< server app index */ + u32 active_open_client_index; /**< active open API client handle */ + u32 active_open_app_index; /**< active open index after attach */ + + uword *proxy_session_by_server_handle; + uword *proxy_session_by_active_open_handle; + + /* + * Configuration params + */ + u8 *connect_uri; /**< URI for slave's connect */ + u32 configured_segment_size; + u32 fifo_size; + u32 private_segment_count; /**< Number of private fifo segs */ + u32 private_segment_size; /**< size of private fifo segs */ + int rcv_buffer_size; + + /* + * Test state variables + */ + proxy_session_t *sessions; /**< Session pool, shared */ + clib_spinlock_t sessions_lock; + u32 **connection_index_by_thread; + pthread_t client_thread_handle; + + /* + * Flags + */ + u8 is_init; + u8 prealloc_fifos; /**< Request fifo preallocation */ + + /* + * Convenience + */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; +} builtin_proxy_main_t; + +builtin_proxy_main_t builtin_proxy_main; + +#endif /* __included_builtin_proxy_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c new file mode 100644 index 00000000..93314529 --- /dev/null +++ b/src/vnet/tcp/builtin_server.c @@ -0,0 +1,455 @@ +/* +* Copyright (c) 2015-2017 Cisco and/or its affiliates. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at: +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+
+typedef struct
+{
+  /*
+   * Server app parameters
+   */
+  unix_shared_memory_queue_t **vpp_queue;
+  unix_shared_memory_queue_t *vl_input_queue;	/**< Server's event queue */
+
+  u32 app_index;		/**< Server app index */
+  u32 my_client_index;		/**< API client handle */
+  u32 node_index;		/**< process node index for event scheduling */
+
+  /*
+   * Config params
+   */
+  u8 no_echo;			/**< Don't echo traffic */
+  u32 fifo_size;		/**< Fifo size */
+  u32 rcv_buffer_size;		/**< Rcv buffer size */
+  u32 prealloc_fifos;		/**< Preallocate fifos */
+  u32 private_segment_count;	/**< Number of private segments */
+  u32 private_segment_size;	/**< Size of private segments */
+  char *server_uri;		/**< Server URI */
+
+  /*
+   * Test state
+   */
+  u8 **rx_buf;			/**< Per-thread RX buffer */
+  u64 byte_index;
+  u32 **rx_retries;
+
+  vlib_main_t *vlib_main;
+} builtin_server_main_t;
+
+builtin_server_main_t builtin_server_main;
+
+int
+builtin_session_accept_callback (stream_session_t * s)
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+
+  bsm->vpp_queue[s->thread_index] =
+    session_manager_get_vpp_event_queue (s->thread_index);
+  s->session_state = SESSION_STATE_READY;
+  bsm->byte_index = 0;
+  vec_validate (bsm->rx_retries[s->thread_index], s->session_index);
+  bsm->rx_retries[s->thread_index][s->session_index] = 0;
+  return 0;
+}
+
+void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+  vnet_disconnect_args_t _a, *a = &_a;
+
+  a->handle = stream_session_handle (s);
+  a->app_index = bsm->app_index;
+  vnet_disconnect_session (a);
+}
+
+void
+builtin_session_reset_callback (stream_session_t * s)
+{
+  clib_warning ("Reset session %U", format_stream_session, s, 2);
+  stream_session_cleanup (s);
+}
+
+
+int
+builtin_session_connected_callback (u32 app_index, u32 api_context,
+				    stream_session_t * s, u8 is_fail)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+int
+builtin_add_segment_callback (u32 client_index,
+			      const u8 * seg_name, u32 seg_size)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+int
+builtin_redirect_connect_callback (u32 client_index, void *mp)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+void
+test_bytes (builtin_server_main_t * bsm, int actual_transfer)
+{
+  int i;
+  u32 my_thread_id = vlib_get_thread_index ();
+
+  for (i = 0; i < actual_transfer; i++)
+    {
+      if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff))
+	{
+	  clib_warning ("at %lld expected %d got %d", bsm->byte_index + i,
+			(bsm->byte_index + i) & 0xff,
+			bsm->rx_buf[my_thread_id][i]);
+	}
+    }
+  bsm->byte_index += actual_transfer;
+}
+
+/*
+ * If no-echo, just read the data and be done with it
+ */
+int
+builtin_server_rx_callback_no_echo (stream_session_t * s)
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+  u32 my_thread_id = vlib_get_thread_index ();
+  int actual_transfer;
+  svm_fifo_t *rx_fifo;
+
+  rx_fifo = s->server_rx_fifo;
+
+  do
+    {
+      actual_transfer =
+	svm_fifo_dequeue_nowait (rx_fifo, bsm->rcv_buffer_size,
+				 bsm->rx_buf[my_thread_id]);
+    }
+  while (actual_transfer > 0);
+  return 0;
+}
+
+int
+builtin_server_rx_callback (stream_session_t * s)
+{
+  u32 n_written, max_dequeue, max_enqueue, max_transfer;
+  int actual_transfer;
+  svm_fifo_t *tx_fifo, *rx_fifo;
+  builtin_server_main_t *bsm = &builtin_server_main;
+  session_fifo_event_t evt;
+  static int serial_number = 0;
+  u32 thread_index = vlib_get_thread_index ();
+
+  ASSERT (s->thread_index == thread_index);
+
+  rx_fifo = s->server_rx_fifo;
+  tx_fifo = s->server_tx_fifo;
+
+  ASSERT (rx_fifo->master_thread_index == thread_index);
+  ASSERT (tx_fifo->master_thread_index == thread_index);
+
+  max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo);
+  max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo);
+
+  if (PREDICT_FALSE (max_dequeue == 0))
+    return 0;
+
+  /* Number of bytes we're going to copy */
+  max_transfer = (max_dequeue < max_enqueue) ? max_dequeue : max_enqueue;
+
+  /* No space in tx fifo */
+  if (PREDICT_FALSE (max_transfer == 0))
+    {
+      /* XXX timeout for sessions that are stuck */
+
+    rx_event:
+      /* Program self-tap to retry */
+      if (svm_fifo_set_event (rx_fifo))
+	{
+	  unix_shared_memory_queue_t *q;
+	  evt.fifo = rx_fifo;
+	  evt.event_type = FIFO_EVENT_BUILTIN_RX;
+	  evt.event_id = 0;
+
+	  q = bsm->vpp_queue[thread_index];
+	  if (PREDICT_FALSE (q->cursize == q->maxsize))
+	    clib_warning ("out of event queue space");
+	  else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0))
+	    clib_warning ("failed to enqueue self-tap");
+
+	  if (bsm->rx_retries[thread_index][s->session_index] == 500000)
+	    {
+	      clib_warning ("session stuck: %U", format_stream_session, s, 2);
+	    }
+	  if (bsm->rx_retries[thread_index][s->session_index] < 500001)
+	    bsm->rx_retries[thread_index][s->session_index]++;
+	}
+
+      return 0;
+    }
+
+  _vec_len (bsm->rx_buf[thread_index]) = max_transfer;
+
+  actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer,
+					     bsm->rx_buf[thread_index]);
+  ASSERT (actual_transfer == max_transfer);
+
+// test_bytes (bsm, actual_transfer);
+
+  /*
+   * Echo back
+   */
+
+  n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer,
+				       bsm->rx_buf[thread_index]);
+
+  if (n_written != max_transfer)
+    clib_warning ("short write!");
+
+  if (svm_fifo_set_event (tx_fifo))
+    {
+      /* Fabricate TX event, send to vpp */
+      evt.fifo = tx_fifo;
+      evt.event_type = FIFO_EVENT_APP_TX;
+      evt.event_id = serial_number++;
+
+      if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index],
+					(u8 *) & evt,
+					0 /* do wait for mutex */ ))
+	clib_warning ("failed to enqueue tx evt");
+    }
+
+  if (PREDICT_FALSE (n_written < max_dequeue))
+    goto rx_event;
+
+  return 0;
+}
+
+static session_cb_vft_t builtin_session_cb_vft = {
+  .session_accept_callback = builtin_session_accept_callback,
+  .session_disconnect_callback = builtin_session_disconnect_callback,
+  .session_connected_callback = builtin_session_connected_callback,
+  .add_segment_callback = builtin_add_segment_callback,
+  .redirect_connect_callback = builtin_redirect_connect_callback,
+  .builtin_server_rx_callback = builtin_server_rx_callback,
+  .session_reset_callback = builtin_session_reset_callback
+};
+
+/* Abuse VPP's input queue */
+static int
+create_api_loopback (vlib_main_t * vm)
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+  api_main_t *am = &api_main;
+  vl_shmem_hdr_t *shmem_hdr;
+
+  shmem_hdr = am->shmem_hdr;
+  bsm->vl_input_queue = shmem_hdr->vl_input_queue;
+  bsm->my_client_index =
+    vl_api_memclnt_create_internal ("tcp_test_server", bsm->vl_input_queue);
+  return 0;
+}
+
+static int
+server_attach ()
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+  u8 segment_name[128];
+  u64 options[SESSION_OPTIONS_N_OPTIONS];
+  vnet_app_attach_args_t _a, *a = &_a;
+
+  memset (a, 0, sizeof (*a));
+  memset (options, 0, sizeof (options));
+
+  if (bsm->no_echo)
+    builtin_session_cb_vft.builtin_server_rx_callback =
+      builtin_server_rx_callback_no_echo;
+  else
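+    /* default mode: echo everything dequeued back to the peer */
+   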
builtin_session_cb_vft.builtin_server_rx_callback = + builtin_server_rx_callback; + a->api_client_index = bsm->my_client_index; + a->session_cb_vft = &builtin_session_cb_vft; + a->options = options; + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size; + a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = + bsm->prealloc_fifos ? bsm->prealloc_fifos : 1; + + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + + a->segment_name = segment_name; + a->segment_name_length = ARRAY_LEN (segment_name); + + if (vnet_application_attach (a)) + { + clib_warning ("failed to attach server"); + return -1; + } + bsm->app_index = a->app_index; + return 0; +} + +static int +server_listen () +{ + builtin_server_main_t *bsm = &builtin_server_main; + vnet_bind_args_t _a, *a = &_a; + memset (a, 0, sizeof (*a)); + a->app_index = bsm->app_index; + a->uri = bsm->server_uri; + return vnet_bind_uri (a); +} + +static int +server_create (vlib_main_t * vm) +{ + builtin_server_main_t *bsm = &builtin_server_main; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; + int i; + + if (bsm->my_client_index == (u32) ~ 0) + { + if (create_api_loopback (vm)) + { + clib_warning ("failed to create api loopback"); + return -1; + } + } + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (builtin_server_main.vpp_queue, num_threads - 1); + vec_validate (bsm->rx_buf, num_threads - 1); + vec_validate (bsm->rx_retries, num_threads - 1); + + for (i = 0; i < num_threads; i++) + vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); + + if (server_attach ()) + { + clib_warning ("failed to attach server"); + return -1; + } + if (server_listen ()) + { + clib_warning ("failed to start listening"); + return -1; + } + return 0; +} + +static clib_error_t * +server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + builtin_server_main_t *bsm = &builtin_server_main; + u8 server_uri_set = 0; + int rv; + u64 tmp; + + bsm->no_echo = 0; + bsm->fifo_size = 64 << 10; + bsm->rcv_buffer_size = 128 << 10; + bsm->prealloc_fifos = 0; + bsm->private_segment_count = 0; + bsm->private_segment_size = 0; + vec_free (bsm->server_uri); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "no-echo")) + bsm->no_echo = 1; + else if (unformat (input, "fifo-size %d", &bsm->fifo_size)) + bsm->fifo_size <<= 10; + else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) + ; + else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bsm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000ULL) + return clib_error_return + (0, "private segment size %lld (%llu) too large", tmp, tmp); + bsm->private_segment_size = tmp; + } + else if (unformat (input, "uri %s", &bsm->server_uri)) + server_uri_set = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. 
*/ );
+
+  if (!server_uri_set)
+    bsm->server_uri = (char *) format (0, "tcp://0.0.0.0/1234%c", 0);
+
+  rv = server_create (vm);
+  switch (rv)
+    {
+    case 0:
+      break;
+    default:
+      return clib_error_return (0, "server_create returned %d", rv);
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+  .path = "test tcp server",
+  .short_help = "test tcp server [no-echo][fifo-size <kbytes>] "
+      "[rcv-buf-size <bytes>][prealloc-fifos <count>]"
+      "[private-segment-count <count>][private-segment-size <bytes[m|g]>]"
+      "[uri <tcp://ip/port>]",
+  .function = server_create_command_fn,
+};
+/* *INDENT-ON* */
+
+clib_error_t *
+builtin_tcp_server_main_init (vlib_main_t * vm)
+{
+  builtin_server_main_t *bsm = &builtin_server_main;
+  bsm->my_client_index = ~0;
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (builtin_tcp_server_main_init);
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/tcp.api b/src/vnet/tcp/tcp.api
new file mode 100644
index 00000000..093a5a89
--- /dev/null
+++ b/src/vnet/tcp/tcp.api
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015-2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief Configure TCP source addresses, for active-open TCP sessions
+
+    TCP src/dst ports are 16 bits, with the low-order 1024 ports
+    reserved. So, it's necessary to provide a considerable number of
+    source IP addresses if one wishes to initiate a large number of
+    connections.
+
+    Each of those addresses needs to have a receive adjacency -
+    either a /32 or a /128 - and vpp needs to answer (proxy) arps or
+    neighbor discovery requests for the addresses.
+
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_ipv6 - 1 for ipv6, 0 for ipv4
+    @param vrf_id - fib table / vrf id for local adjacencies
+    @param first_address - first address that TCP will use
+    @param last_address - last address that TCP will use
+*/
+autoreply define tcp_configure_src_addresses {
+    u32 client_index;
+    u32 context;
+    u8 is_ipv6;
+    u32 vrf_id;
+    u8 first_address[16];
+    u8 last_address[16];
+};
+
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
new file mode 100644
index 00000000..a365cb48
--- /dev/null
+++ b/src/vnet/tcp/tcp.c
@@ -0,0 +1,1943 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/** + * @file + * @brief TCP host stack utilities + */ + +#include <vnet/tcp/tcp.h> +#include <vnet/session/session.h> +#include <vnet/fib/fib.h> +#include <vnet/dpo/load_balance.h> +#include <vnet/dpo/receive_dpo.h> +#include <vnet/ip/ip6_neighbor.h> +#include <math.h> + +tcp_main_t tcp_main; + +static u32 +tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) +{ + tcp_main_t *tm = &tcp_main; + tcp_connection_t *listener; + + pool_get (tm->listener_pool, listener); + memset (listener, 0, sizeof (*listener)); + + listener->c_c_index = listener - tm->listener_pool; + listener->c_lcl_port = lcl->port; + + if (lcl->is_ip4) + { + listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32; + listener->c_is_ip4 = 1; + } + else + { + clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6, + sizeof (ip6_address_t)); + + } + listener->c_transport_proto = TRANSPORT_PROTO_TCP; + listener->c_s_index = session_index; + listener->state = TCP_STATE_LISTEN; + + tcp_connection_timers_init (listener); + + TCP_EVT_DBG (TCP_EVT_BIND, listener); + + return listener->c_c_index; +} + +u32 +tcp_session_bind (u32 session_index, transport_endpoint_t * tep) +{ + return tcp_connection_bind (session_index, tep); +} + +static void +tcp_connection_unbind (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + + tc = pool_elt_at_index (tm->listener_pool, listener_index); + + TCP_EVT_DBG (TCP_EVT_UNBIND, tc); + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + + pool_put_index (tm->listener_pool, listener_index); +} + +u32 +tcp_session_unbind (u32 listener_index) +{ + tcp_connection_unbind (listener_index); + return 0; +} + +transport_connection_t * +tcp_session_get_listener (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + tc = pool_elt_at_index (tm->listener_pool, listener_index); + return &tc->connection; +} + +always_inline void +transport_endpoint_del (u32 tepi) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + clib_spinlock_lock_if_init (&tm->local_endpoints_lock); + pool_put_index (tm->local_endpoints, tepi); + clib_spinlock_unlock_if_init (&tm->local_endpoints_lock); +} + +always_inline transport_endpoint_t * +transport_endpoint_new (void) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + transport_endpoint_t *tep; + pool_get (tm->local_endpoints, tep); + return tep; +} + +/** + * Cleanup half-open connection + * + */ +void +tcp_half_open_connection_del (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + clib_spinlock_lock_if_init (&tm->half_open_lock); + pool_put_index (tm->half_open_connections, tc->c_c_index); + if (CLIB_DEBUG) + memset (tc, 0xFA, sizeof (*tc)); + clib_spinlock_unlock_if_init (&tm->half_open_lock); +} + +/** + * Try to cleanup half-open connection + * + * If called from a thread that doesn't own tc, the call won't have any + * effect. + * + * @param tc - connection to be cleaned up + * @return non-zero if cleanup failed. 
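+ *
+ * Illustrative caller pattern (a sketch, not prescribed by this API):
+ *
+ *   if (tcp_half_open_connection_cleanup (tc))
+ *     ;  // not the owning thread: the RETRANSMIT_SYN or ESTABLISH
+ *        // timer will pop on the owner and retry the cleanup there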
+ */
+int
+tcp_half_open_connection_cleanup (tcp_connection_t * tc)
+{
+  /* Make sure this is the owning thread */
+  if (tc->c_thread_index != vlib_get_thread_index ())
+    return 1;
+  tcp_timer_reset (tc, TCP_TIMER_ESTABLISH);
+  tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT_SYN);
+  tcp_half_open_connection_del (tc);
+  return 0;
+}
+
+tcp_connection_t *
+tcp_half_open_connection_new (void)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  tcp_connection_t *tc = 0;
+  ASSERT (vlib_get_thread_index () == 0);
+  pool_get (tm->half_open_connections, tc);
+  memset (tc, 0, sizeof (*tc));
+  tc->c_c_index = tc - tm->half_open_connections;
+  return tc;
+}
+
+/**
+ * Cleans up connection state.
+ *
+ * No notifications.
+ */
+void
+tcp_connection_cleanup (tcp_connection_t * tc)
+{
+  tcp_main_t *tm = &tcp_main;
+  u32 tepi;
+  transport_endpoint_t *tep;
+
+  /* Cleanup local endpoint if this was an active connect */
+  tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip,
+				    clib_net_to_host_u16 (tc->c_lcl_port));
+  if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX)
+    {
+      tep = pool_elt_at_index (tm->local_endpoints, tepi);
+      transport_endpoint_table_del (&tm->local_endpoints_table, tep);
+      transport_endpoint_del (tepi);
+    }
+
+  /* Check if connection is not yet fully established */
+  if (tc->state == TCP_STATE_SYN_SENT)
+    {
+      /* Try to remove the half-open connection. If this is not the owning
+       * thread, tc won't be removed. Retransmit or establish timers will
+       * eventually expire and call cleanup again on the right thread. */
+      tcp_half_open_connection_cleanup (tc);
+    }
+  else
+    {
+      int thread_index = tc->c_thread_index;
+
+      /* Make sure all timers are cleared */
+      tcp_connection_timers_reset (tc);
+
+      /* Poison the entry */
+      if (CLIB_DEBUG > 0)
+	memset (tc, 0xFA, sizeof (*tc));
+      pool_put (tm->connections[thread_index], tc);
+    }
+}
+
+/**
+ * Connection removal.
+ *
+ * This should be called only once connection enters CLOSED state. Note
+ * that it notifies the session of the removal event, so if the goal is to
+ * just remove the connection, call tcp_connection_cleanup instead.
+ */
+void
+tcp_connection_del (tcp_connection_t * tc)
+{
+  TCP_EVT_DBG (TCP_EVT_DELETE, tc);
+  stream_session_delete_notify (&tc->connection);
+  tcp_connection_cleanup (tc);
+}
+
+tcp_connection_t *
+tcp_connection_new (u8 thread_index)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  tcp_connection_t *tc;
+
+  pool_get (tm->connections[thread_index], tc);
+  memset (tc, 0, sizeof (*tc));
+  tc->c_c_index = tc - tm->connections[thread_index];
+  tc->c_thread_index = thread_index;
+  return tc;
+}
+
+/** Notify session that connection has been reset.
+ *
+ * Switch state to closed and wait for session to call cleanup.
+ */
+void
+tcp_connection_reset (tcp_connection_t * tc)
+{
+  TCP_EVT_DBG (TCP_EVT_RST_RCVD, tc);
+  switch (tc->state)
+    {
+    case TCP_STATE_SYN_RCVD:
+      /* Cleanup everything. App wasn't notified yet */
+      stream_session_delete_notify (&tc->connection);
+      tcp_connection_cleanup (tc);
+      break;
+    case TCP_STATE_SYN_SENT:
+      stream_session_connect_notify (&tc->connection, 1 /* fail */ );
+      tcp_connection_cleanup (tc);
+      break;
+    case TCP_STATE_ESTABLISHED:
+    case TCP_STATE_CLOSE_WAIT:
+    case TCP_STATE_FIN_WAIT_1:
+    case TCP_STATE_FIN_WAIT_2:
+    case TCP_STATE_CLOSING:
+      tc->state = TCP_STATE_CLOSED;
+      TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+
+      /* Make sure all timers are cleared */
+      tcp_connection_timers_reset (tc);
+      stream_session_reset_notify (&tc->connection);
+
+      /* Wait for cleanup from session layer but not forever */
+      tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+      break;
+    case TCP_STATE_CLOSED:
+      return;
+    }
+}
+
+/**
+ * Begin connection closing procedure.
+ *
+ * If at the end the connection is not in CLOSED state, it is not removed.
+ * Instead, we rely on TCP to advance through the state machine to either
+ * 1) LAST_ACK (passive close) whereby when the last ACK is received
+ * tcp_connection_del is called. This notifies session of the delete and
+ * calls cleanup.
+ * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers
+ * and cleanup is called.
+ *
+ * N.B. Half-close connections are not supported
+ */
+void
+tcp_connection_close (tcp_connection_t * tc)
+{
+  TCP_EVT_DBG (TCP_EVT_CLOSE, tc);
+
+  /* Send/Program FIN if needed and switch state */
+  switch (tc->state)
+    {
+    case TCP_STATE_SYN_SENT:
+      tc->state = TCP_STATE_CLOSED;
+      break;
+    case TCP_STATE_SYN_RCVD:
+      tcp_send_fin (tc);
+      tc->state = TCP_STATE_FIN_WAIT_1;
+      break;
+    case TCP_STATE_ESTABLISHED:
+      if (!stream_session_tx_fifo_max_dequeue (&tc->connection))
+	tcp_send_fin (tc);
+      else
+	tc->flags |= TCP_CONN_FINPNDG;
+      tc->state = TCP_STATE_FIN_WAIT_1;
+      break;
+    case TCP_STATE_CLOSE_WAIT:
+      tcp_send_fin (tc);
+      tc->state = TCP_STATE_LAST_ACK;
+      break;
+    case TCP_STATE_FIN_WAIT_1:
+      break;
+    default:
+      clib_warning ("state: %u", tc->state);
+    }
+
+  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+
+  /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */
+  if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID
+      && tc->state == TCP_STATE_CLOSED)
+    tcp_connection_del (tc);
+}
+
+void
+tcp_session_close (u32 conn_index, u32 thread_index)
+{
+  tcp_connection_t *tc;
+  tc = tcp_connection_get (conn_index, thread_index);
+  tcp_connection_close (tc);
+}
+
+void
+tcp_session_cleanup (u32 conn_index, u32 thread_index)
+{
+  tcp_connection_t *tc;
+  tc = tcp_connection_get (conn_index, thread_index);
+
+  /* Wait for the session tx events to clear */
+  tc->state = TCP_STATE_CLOSED;
+  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+  tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+}
+
+void *
+ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
+{
+  ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+  ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+  ip_interface_address_t *ia = 0;
+
+  if (is_ip4)
+    {
+      /* *INDENT-OFF* */
+      foreach_ip_interface_address (lm4, ia, sw_if_index, 1 /* unnumbered */ ,
+      ({
+        return ip_interface_address_get_address (lm4, ia);
+      }));
+      /* *INDENT-ON* */
+    }
+  else
+    {
+      /* *INDENT-OFF* */
+      foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ ,
+      ({
+        ip6_address_t *rv;
+        rv = ip_interface_address_get_address (lm6, ia);
+        /* Trying to use a link-local ip6 src address is a fool's errand */
+        if (!ip6_address_is_link_local_unicast (rv))
+          return rv;
+      }));
+      /* *INDENT-ON* */
+    }
+
+  return 0;
+}
+
+#define PORT_MASK ((1 << 16) - 1)
+/**
+ * Allocate a local port and, if successful, add an entry to the local
+ * endpoint table to mark the pair as used.
+ */
+int
+tcp_allocate_local_port (ip46_address_t * ip)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  transport_endpoint_t *tep;
+  u32 tei;
+  u16 min = 1024, max = 65535;	/* XXX configurable ? */
+  int tries, limit;
+
+  limit = max - min;
+
+  /* Only support active opens from thread 0 */
+  ASSERT (vlib_get_thread_index () == 0);
+
+  /* Search for first free slot */
+  for (tries = 0; tries < limit; tries++)
+    {
+      u16 port = 0;
+
+      /* Find a port in the specified range */
+      while (1)
+	{
+	  port = random_u32 (&tm->port_allocator_seed) & PORT_MASK;
+	  if (PREDICT_TRUE (port >= min && port < max))
+	    break;
+	}
+
+      /* Look it up */
+      tei = transport_endpoint_lookup (&tm->local_endpoints_table, ip, port);
+      /* If not found, we're done */
+      if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX)
+	{
+	  clib_spinlock_lock_if_init (&tm->local_endpoints_lock);
+	  tep = transport_endpoint_new ();
+	  clib_memcpy (&tep->ip, ip, sizeof (*ip));
+	  tep->port = port;
+	  transport_endpoint_table_add (&tm->local_endpoints_table, tep,
+					tep - tm->local_endpoints);
+	  clib_spinlock_unlock_if_init (&tm->local_endpoints_lock);
+
+	  return tep->port;
+	}
+    }
+  return -1;
+}
+
+/**
+ * Initialize all connection timers as invalid
+ */
+void
+tcp_connection_timers_init (tcp_connection_t * tc)
+{
+  int i;
+
+  /* Set all to invalid */
+  for (i = 0; i < TCP_N_TIMERS; i++)
+    {
+      tc->timers[i] = TCP_TIMER_HANDLE_INVALID;
+    }
+
+  tc->rto = TCP_RTO_INIT;
+}
+
+/**
+ * Stop all connection timers
+ */
+void
+tcp_connection_timers_reset (tcp_connection_t * tc)
+{
+  int i;
+  for (i = 0; i < TCP_N_TIMERS; i++)
+    {
+      tcp_timer_reset (tc, i);
+    }
+}
+
+#if 0
+typedef struct ip4_tcp_hdr
+{
+  ip4_header_t ip;
+  tcp_header_t tcp;
+} ip4_tcp_hdr_t;
+
+typedef struct ip6_tcp_hdr
+{
+  ip6_header_t ip;
+  tcp_header_t tcp;
+} ip6_tcp_hdr_t;
+
+static void
+tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo,
+				 dpo_id_t * result)
+{
+  const dpo_id_t *choice;
+  load_balance_t *lb;
+  int hash;
+
+  lb = load_balance_get (dpo->dpoi_index);
+  if (tc->c_is_ip4)
+    {
+      ip4_tcp_hdr_t hdr;
+      memset (&hdr, 0, sizeof (hdr));
+      hdr.ip.protocol = IP_PROTOCOL_TCP;
+      hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32;
+      hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32;
+      hdr.tcp.src_port = tc->c_lcl_port;
+      hdr.tcp.dst_port = tc->c_rmt_port;
+      hash = ip4_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
+    }
+  else
+    {
+      ip6_tcp_hdr_t hdr;
+      memset (&hdr, 0, sizeof (hdr));
+      hdr.ip.protocol = IP_PROTOCOL_TCP;
+      clib_memcpy (&hdr.ip.src_address, &tc->c_lcl_ip.ip6,
+		   sizeof (ip6_address_t));
+      clib_memcpy (&hdr.ip.dst_address, &tc->c_rmt_ip.ip6,
+		   sizeof (ip6_address_t));
+      hdr.tcp.src_port = tc->c_lcl_port;
+      hdr.tcp.dst_port = tc->c_rmt_port;
+      hash = ip6_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
+    }
+  choice = load_balance_get_bucket_i (lb, hash & lb->lb_n_buckets_minus_1);
+  dpo_copy (result, choice);
+}
+
+fib_node_index_t
+tcp_lookup_rmt_in_fib (tcp_connection_t * tc)
+{
+  fib_prefix_t prefix;
+  u32 fib_index;
+
+  clib_memcpy (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr));
+  prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
+  prefix.fp_len = tc->c_is_ip4 ?
32 : 128; + fib_index = fib_table_find (prefix.fp_proto, tc->c_vrf); + return fib_table_lookup (fib_index, &prefix); +} + +static int +tcp_connection_stack_on_fib_entry (tcp_connection_t * tc) +{ + dpo_id_t choice = DPO_INVALID; + u32 output_node_index; + fib_entry_t *fe; + + fe = fib_entry_get (tc->c_rmt_fei); + if (fe->fe_lb.dpoi_type != DPO_LOAD_BALANCE) + return -1; + + tcp_connection_select_lb_bucket (tc, &fe->fe_lb, &choice); + + output_node_index = + tc->c_is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + dpo_stack_from_node (output_node_index, &tc->c_rmt_dpo, &choice); + return 0; +} + +/** Stack tcp connection on peer's fib entry. + * + * This ultimately populates the dpo the connection will use to send packets. + */ +static void +tcp_connection_fib_attach (tcp_connection_t * tc) +{ + tc->c_rmt_fei = tcp_lookup_rmt_in_fib (tc); + + ASSERT (tc->c_rmt_fei != FIB_NODE_INDEX_INVALID); + + tcp_connection_stack_on_fib_entry (tc); +} +#endif /* 0 */ + +/** + * Initialize connection send variables. + */ +void +tcp_init_snd_vars (tcp_connection_t * tc) +{ + u32 time_now; + + /* + * We use the time to randomize iss and for setting up the initial + * timestamp. Make sure it's updated otherwise syn and ack in the + * handshake may make it look as if time has flown in the opposite + * direction for us. + */ + tcp_set_time_now (vlib_get_thread_index ()); + time_now = tcp_time_now (); + + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_nxt = tc->iss + 1; + tc->snd_una_max = tc->snd_nxt; +} + +/** Initialize tcp connection variables + * + * Should be called after having received a msg from the peer, i.e., a SYN or + * a SYNACK, such that connection options have already been exchanged. */ +void +tcp_connection_init_vars (tcp_connection_t * tc) +{ + tcp_connection_timers_init (tc); + tcp_init_mss (tc); + scoreboard_init (&tc->sack_sb); + tcp_cc_init (tc); + if (tc->state == TCP_STATE_SYN_RCVD) + tcp_init_snd_vars (tc); + + // tcp_connection_fib_attach (tc); +} + +int +tcp_connection_open (transport_endpoint_t * rmt) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + fib_prefix_t prefix; + fib_node_index_t fei; + u32 sw_if_index, fib_index; + ip46_address_t lcl_addr; + int lcl_port; + + /* + * Find the local address and allocate port + */ + memset (&lcl_addr, 0, sizeof (lcl_addr)); + + /* Find a FIB path to the destination */ + clib_memcpy (&prefix.fp_addr, &rmt->ip, sizeof (rmt->ip)); + prefix.fp_proto = rmt->is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; + prefix.fp_len = rmt->is_ip4 ? 32 : 128; + + fib_index = fib_table_find (prefix.fp_proto, rmt->vrf); + if (fib_index == (u32) ~ 0) + { + clib_warning ("no fib table"); + return -1; + } + + fei = fib_table_lookup (fib_index, &prefix); + + /* Couldn't find route to destination. Bail out. 
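+     Without a resolving route there is no interface, and hence no
+     source address, from which to open the connection.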
*/ + if (fei == FIB_NODE_INDEX_INVALID) + { + clib_warning ("no route to destination"); + return -1; + } + + sw_if_index = fib_entry_get_resolving_interface (fei); + + if (sw_if_index == (u32) ~ 0) + { + clib_warning ("no resolving interface for %U", format_ip46_address, + &rmt->ip, IP46_TYPE_IP4); + return -1; + } + + if (rmt->is_ip4) + { + ip4_address_t *ip4; + int index; + if (vec_len (tm->ip4_src_addresses)) + { + index = tm->last_v4_address_rotor++; + if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses)) + tm->last_v4_address_rotor = 0; + lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32; + } + else + { + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } + } + else + { + ip6_address_t *ip6; + int index; + + if (vec_len (tm->ip6_src_addresses)) + { + index = tm->last_v6_address_rotor++; + if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses)) + tm->last_v6_address_rotor = 0; + clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index], + sizeof (*ip6)); + } + else + { + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + if (ip6 == 0) + { + clib_warning ("no routable ip6 addresses on %U", + format_vnet_sw_if_index_name, vnet_get_main (), + sw_if_index); + return -1; + } + + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } + } + + /* Allocate source port */ + lcl_port = tcp_allocate_local_port (&lcl_addr); + if (lcl_port < 1) + { + clib_warning ("Failed to allocate src port"); + return -1; + } + + /* + * Create connection and send SYN + */ + clib_spinlock_lock_if_init (&tm->half_open_lock); + tc = tcp_half_open_connection_new (); + clib_memcpy (&tc->c_rmt_ip, &rmt->ip, sizeof (ip46_address_t)); + clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t)); + tc->c_rmt_port = rmt->port; + tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); + tc->c_is_ip4 = rmt->is_ip4; + tc->c_transport_proto = TRANSPORT_PROTO_TCP; + tc->c_vrf = rmt->vrf; + /* The other connection vars will be initialized after SYN ACK */ + tcp_connection_timers_init (tc); + + TCP_EVT_DBG (TCP_EVT_OPEN, tc); + tc->state = TCP_STATE_SYN_SENT; + tcp_init_snd_vars (tc); + tcp_send_syn (tc); + clib_spinlock_unlock_if_init (&tm->half_open_lock); + + return tc->c_c_index; +} + +int +tcp_session_open (transport_endpoint_t * tep) +{ + return tcp_connection_open (tep); +} + +const char *tcp_dbg_evt_str[] = { +#define _(sym, str) str, + foreach_tcp_dbg_evt +#undef _ +}; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + +u8 * +format_tcp_state (u8 * s, va_list * args) +{ + u32 state = va_arg (*args, u32); + + if (state < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state]); + else + s = format (s, "UNKNOWN (%d (0x%x))", state, state); + return s; +} + +const char *tcp_conn_timers[] = { +#define _(sym, str) str, + foreach_tcp_timer +#undef _ +}; + +u8 * +format_tcp_timers (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = -1; + + for (i = 0; i < TCP_N_TIMERS; i++) + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + last = i; + + s = format (s, "["); + for (i = 0; i < last; i++) + { + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + s = format (s, "%s,", tcp_conn_timers[i]); + } + + if (last >= 0) + s = format (s, "%s]", tcp_conn_timers[i]); + else + s = format (s, "]"); + + return s; +} + +u8 * +format_tcp_congestion_status (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tcp_in_recovery (tc)) + s = 
format (s, "recovery"); + else if (tcp_in_fastrecovery (tc)) + s = format (s, "fastrecovery"); + else + s = format (s, "none"); + return s; +} + +u8 * +format_tcp_vars (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", + tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, + tc->snd_una_max - tc->iss); + s = format (s, " rcv_nxt %u rcv_las %u\n", + tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs); + s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", + tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, + tc->snd_wl2 - tc->iss); + s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", + tcp_flight_size (tc), tcp_available_output_snd_space (tc), + tcp_rcv_wnd_available (tc)); + s = format (s, " cong %U ", format_tcp_congestion_status, tc); + s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", + tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u", + tc->prev_ssthresh, tc->snd_congestion - tc->iss, + tc->rcv_dupacks); + s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss); + s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr, + tc->tsecr_last_ack); + s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, + tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); + s = format (s, "rtt_seq %u\n", tc->rtt_seq); + s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + tcp_time_now () - tc->tsval_recent_age); + if (tc->state >= TCP_STATE_ESTABLISHED) + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb, + tc); + if (vec_len (tc->snd_sacks)) + s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); + + return s; +} + +u8 * +format_tcp_connection_id (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (!tc) + return s; + if (tc->c_is_ip4) + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip4_address, &tc->c_lcl_ip4, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address, + &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port)); + } + else + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip6_address, &tc->c_lcl_ip6, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address, + &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port)); + } + + return s; +} + +u8 * +format_tcp_connection (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + u32 verbose = va_arg (*args, u32); + + if (!tc) + return s; + s = format (s, "%-50U", format_tcp_connection_id, tc); + if (verbose) + { + s = format (s, "%-15U", format_tcp_state, tc->state); + if (verbose > 1) + s = format (s, " %U\n%U", format_tcp_timers, tc, format_tcp_vars, tc); + } + + return s; +} + +u8 * +format_tcp_session (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + u32 verbose = va_arg (*args, u32); + tcp_connection_t *tc; + + tc = tcp_connection_get (tci, thread_index); + if (tc) + s = format (s, "%U", format_tcp_connection, tc, verbose); + else + s = format (s, "empty\n"); + return s; +} + +u8 * +format_tcp_listener_session (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_listener_get (tci); + return format (s, "%U", format_tcp_connection_id, tc); +} + +u8 * +format_tcp_half_open_session (u8 * s, va_list * args) +{ + u32 tci = 
va_arg (*args, u32); + tcp_connection_t *tc = tcp_half_open_connection_get (tci); + return format (s, "%U", format_tcp_connection_id, tc); +} + +u8 * +format_tcp_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->snd_sacks; + sack_block_t *block; + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } + return s; +} + +u8 * +format_tcp_rcv_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->rcv_opts.sacks; + sack_block_t *block; + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->iss, + block->end - tc->iss); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->iss, + block->end - tc->iss); + } + return s; +} + +u8 * +format_tcp_sack_hole (u8 * s, va_list * args) +{ + sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tc) + s = format (s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss); + else + s = format (s, " [%u, %u]", hole->start, hole->end); + return s; +} + +u8 * +format_tcp_scoreboard (u8 * s, va_list * args) +{ + sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_scoreboard_hole_t *hole; + s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", + sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); + s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", + sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", + sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + + hole = scoreboard_first_hole (sb); + if (hole) + s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail); + + while (hole) + { + s = format (s, "%U", format_tcp_sack_hole, hole, tc); + hole = scoreboard_next_hole (sb, hole); + } + + return s; +} + +transport_connection_t * +tcp_session_get_transport (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index); + return &tc->connection; +} + +transport_connection_t * +tcp_half_open_session_get_transport (u32 conn_index) +{ + tcp_connection_t *tc = tcp_half_open_connection_get (conn_index); + return &tc->connection; +} + +/** + * Compute maximum segment size for session layer. + * + * Since the result needs to be the actual data length, it first computes + * the tcp options to be used in the next burst and subtracts their + * length from the connection's snd_mss. + */ +u16 +tcp_session_send_mss (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + + /* Ensure snd_mss does accurately reflect the amount of data we can push + * in a segment. This also makes sure that options are updated according to + * the current state of the connection. 
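+   * For instance (numbers illustrative): with an advertised MSS of 1460
+   * and timestamps in play (10 option bytes, padded to 12), the session
+   * layer would be told it can push at most 1460 - 12 = 1448 bytes.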
*/ + tcp_update_snd_mss (tc); + + return tc->snd_mss; +} + +always_inline u32 +tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) +{ + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) + { + return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; + } + + /* If not snd_wnd constrained and we can't write at least a segment, + * don't try at all */ + if (PREDICT_FALSE (snd_space < tc->snd_mss)) + return snd_space < tc->cwnd ? 0 : snd_space; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); +} + +/** + * Compute tx window session is allowed to fill. + * + * Takes into account available send space, snd_mss and the congestion + * state of the connection. If possible, the value returned is a multiple + * of snd_mss. + * + * @param tc tcp connection + * @return number of bytes session is allowed to write + */ +u32 +tcp_snd_space (tcp_connection_t * tc) +{ + int snd_space, snt_limited; + + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) + { + snd_space = tcp_available_output_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked + * bytes then we can still send as per Limited Transmit (RFC3042) */ + if (PREDICT_FALSE (tc->rcv_dupacks != 0 + && (tcp_opts_sack_permitted (tc) + && tc->sack_sb.last_sacked_bytes == 0))) + { + if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt) + tc->limited_transmit = tc->snd_nxt; + ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); + + snt_limited = tc->snd_nxt - tc->limited_transmit; + snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0); + } + return tcp_round_snd_space (tc, snd_space); + } + + if (tcp_in_recovery (tc)) + { + tc->snd_nxt = tc->snd_una_max; + snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes + - (tc->snd_una_max - tc->snd_congestion); + if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) + return 0; + return tcp_round_snd_space (tc, snd_space); + } + + /* RFC 5681: When previously unsent data is available and the new value of + * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS + * bytes of previously unsent data. 
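+ *
+ * Concretely (a sketch with assumed numbers): entering fast recovery with
+ * snd_mss = 1448, the first call that finds at least one mss of output
+ * space returns exactly 1448 and latches the 1-SMSS flag below, so
+ * subsequent calls return 0 until the connection leaves fast recovery.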
*/ + if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc)) + { + if (tcp_available_output_snd_space (tc) < tc->snd_mss) + return 0; + tcp_fastrecovery_1_smss_on (tc); + return tc->snd_mss; + } + + return 0; +} + +u32 +tcp_session_send_space (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return clib_min (tcp_snd_space (tc), + tc->snd_wnd - (tc->snd_nxt - tc->snd_una)); +} + +i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + +u32 +tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + + ASSERT (seq_geq (tc->snd_nxt, tc->snd_una)); + + /* This still works if fast retransmit is on */ + return (tc->snd_nxt - tc->snd_una); +} + +/* *INDENT-OFF* */ +const static transport_proto_vft_t tcp_proto = { + .bind = tcp_session_bind, + .unbind = tcp_session_unbind, + .push_header = tcp_push_header, + .get_connection = tcp_session_get_transport, + .get_listener = tcp_session_get_listener, + .get_half_open = tcp_half_open_session_get_transport, + .open = tcp_session_open, + .close = tcp_session_close, + .cleanup = tcp_session_cleanup, + .send_mss = tcp_session_send_mss, + .send_space = tcp_session_send_space, + .tx_fifo_offset = tcp_session_tx_fifo_offset, + .format_connection = format_tcp_session, + .format_listener = format_tcp_listener_session, + .format_half_open = format_tcp_half_open_session, +}; +/* *INDENT-ON* */ + +void +tcp_timer_keep_handler (u32 conn_index) +{ + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, thread_index); + tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; + + tcp_connection_close (tc); +} + +void +tcp_timer_establish_handler (u32 conn_index) +{ + tcp_connection_t *tc; + + tc = tcp_half_open_connection_get (conn_index); + if (tc) + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + stream_session_connect_notify (&tc->connection, 1 /* fail */ ); + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); + } + else + { + tc = tcp_connection_get (conn_index, vlib_get_thread_index ()); + /* note: the connection may have already disappeared */ + if (PREDICT_FALSE (tc == 0)) + return; + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); + ASSERT (tc->state == TCP_STATE_SYN_RCVD); + /* Start cleanup. App wasn't notified yet so use delete notify as + * opposed to delete to cleanup session layer state. */ + stream_session_delete_notify (&tc->connection); + } + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + tcp_connection_cleanup (tc); +} + +void +tcp_timer_waitclose_handler (u32 conn_index) +{ + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, thread_index); + if (!tc) + return; + tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; + + /* Session didn't come back with a close(). Send FIN either way + * and switch to LAST_ACK. */ + if (tc->state == TCP_STATE_CLOSE_WAIT) + { + if (tc->flags & TCP_CONN_FINSNT) + { + clib_warning ("FIN was sent and still in CLOSE WAIT. 
Weird!"); + } + + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + + /* Make sure we don't wait in LAST ACK forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + /* Don't delete the connection yet */ + return; + } + + tcp_connection_del (tc); +} + +/* *INDENT-OFF* */ +static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = +{ + tcp_timer_retransmit_handler, + tcp_timer_delack_handler, + tcp_timer_persist_handler, + tcp_timer_keep_handler, + tcp_timer_waitclose_handler, + tcp_timer_retransmit_syn_handler, + tcp_timer_establish_handler +}; +/* *INDENT-ON* */ + +static void +tcp_expired_timers_dispatch (u32 * expired_timers) +{ + int i; + u32 connection_index, timer_id; + + for (i = 0; i < vec_len (expired_timers); i++) + { + /* Get session index and timer id */ + connection_index = expired_timers[i] & 0x0FFFFFFF; + timer_id = expired_timers[i] >> 28; + + TCP_EVT_DBG (TCP_EVT_TIMER_POP, connection_index, timer_id); + + /* Handle expiration */ + (*timer_expiration_handlers[timer_id]) (connection_index); + } +} + +void +tcp_initialize_timer_wheels (tcp_main_t * tm) +{ + tw_timer_wheel_16t_2w_512sl_t *tw; + /* *INDENT-OFF* */ + foreach_vlib_main (({ + tw = &tm->timer_wheels[ii]; + tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, + 100e-3 /* timer period 100ms */ , ~0); + tw->last_run_time = vlib_time_now (this_vlib_main); + })); + /* *INDENT-ON* */ +} + +clib_error_t * +tcp_main_enable (vlib_main_t * vm) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + ip_protocol_info_t *pi; + ip_main_t *im = &ip_main; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + clib_error_t *error = 0; + u32 num_threads; + int thread; + tcp_connection_t *tc __attribute__ ((unused)); + u32 preallocated_connections_per_thread; + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return error; + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + /* + * Registrations + */ + + /* Register with IP */ + pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP); + if (pi == 0) + return clib_error_return (0, "TCP protocol info AWOL"); + pi->format_header = format_tcp_header; + pi->unformat_pg_edit = unformat_pg_tcp_header; + + ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index); + ip6_register_protocol (IP_PROTOCOL_TCP, tcp6_input_node.index); + + /* Register as transport with session layer */ + session_register_transport (TRANSPORT_PROTO_TCP, 1, &tcp_proto); + session_register_transport (TRANSPORT_PROTO_TCP, 0, &tcp_proto); + + /* + * Initialize data structures + */ + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (tm->connections, num_threads - 1); + + /* + * Preallocate connections. Assume that thread 0 won't + * use preallocated threads when running multi-core + */ + if (num_threads == 1) + { + thread = 0; + preallocated_connections_per_thread = tm->preallocated_connections; + } + else + { + thread = 1; + preallocated_connections_per_thread = + tm->preallocated_connections / (num_threads - 1); + } + for (; thread < num_threads; thread++) + { + if (preallocated_connections_per_thread) + pool_init_fixed (tm->connections[thread], + preallocated_connections_per_thread); + } + + /* + * Use a preallocated half-open connection pool? 
+ */ + if (tm->preallocated_half_open_connections) + pool_init_fixed (tm->half_open_connections, + tm->preallocated_half_open_connections); + + /* Initialize per worker thread tx buffers (used for control messages) */ + vec_validate (tm->tx_buffers, num_threads - 1); + + /* Initialize timer wheels */ + vec_validate (tm->timer_wheels, num_threads - 1); + tcp_initialize_timer_wheels (tm); + + /* Initialize clocks per tick for TCP timestamp. Used to compute + * monotonically increasing timestamps. */ + tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock + / TCP_TSTAMP_RESOLUTION; + + if (tm->local_endpoints_table_buckets == 0) + tm->local_endpoints_table_buckets = 250000; + if (tm->local_endpoints_table_memory == 0) + tm->local_endpoints_table_memory = 512 << 20; + + clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", + tm->local_endpoints_table_buckets, + tm->local_endpoints_table_memory); + + /* Initialize [port-allocator] random number seed */ + tm->port_allocator_seed = (u32) clib_cpu_time_now (); + + if (num_threads > 1) + { + clib_spinlock_init (&tm->half_open_lock); + clib_spinlock_init (&tm->local_endpoints_lock); + } + + vec_validate (tm->tx_frames[0], num_threads - 1); + vec_validate (tm->tx_frames[1], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1); + + tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + vec_validate (tm->time_now, num_threads - 1); + return error; +} + +clib_error_t * +vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en) +{ + if (is_en) + { + if (tcp_main.is_enabled) + return 0; + + return tcp_main_enable (vm); + } + else + { + tcp_main.is_enabled = 0; + } + + return 0; +} + +void +tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) +{ + tcp_main_t *tm = &tcp_main; + if (is_ip4) + tm->punt_unknown4 = is_add; + else + tm->punt_unknown6 = is_add; +} + +clib_error_t * +tcp_init (vlib_main_t * vm) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tm->is_enabled = 0; + tcp_api_reference (); + return 0; +} + +VLIB_INIT_FUNCTION (tcp_init); + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u64 tmp; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "preallocated-connections %d", + &tm->preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tm->preallocated_half_open_connections)) + ; + else if (unformat (input, "local-endpoints-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + tm->local_endpoints_table_memory = tmp; + } + else if (unformat (input, "local-endpoints-table-buckets %d", + &tm->local_endpoints_table_buckets)) + ; + + + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + + +/** + * \brief Configure an ipv4 source address range + * @param vm vlib_main_t pointer + * @param start first ipv4 address in the source address range + * @param end last ipv4 address in the source address range + * @param table_id VRF / table ID, 0 for the default FIB + * @return 0 if all OK, else an error indication from api_errno.h + */ + +int +tcp_configure_v4_source_address_range (vlib_main_t * vm, + ip4_address_t * 
start, + ip4_address_t * end, u32 table_id) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vnet_main_t *vnm = vnet_get_main (); + u32 start_host_byte_order, end_host_byte_order; + fib_prefix_t prefix; + vnet_sw_interface_t *si; + fib_node_index_t fei; + u32 fib_index = 0; + u32 sw_if_index; + int rv; + int vnet_proxy_arp_add_del (ip4_address_t * lo_addr, + ip4_address_t * hi_addr, u32 fib_index, + int is_del); + + memset (&prefix, 0, sizeof (prefix)); + + fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id); + + if (fib_index == ~0) + return VNET_API_ERROR_NO_SUCH_FIB; + + start_host_byte_order = clib_net_to_host_u32 (start->as_u32); + end_host_byte_order = clib_net_to_host_u32 (end->as_u32); + + /* sanity check for reversed args or some such */ + if ((end_host_byte_order - start_host_byte_order) > (10 << 10)) + return VNET_API_ERROR_INVALID_ARGUMENT; + + /* Lookup the last address, to identify the interface involved */ + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t)); + + fei = fib_table_lookup (fib_index, &prefix); + + /* Couldn't find route to destination. Bail out. */ + if (fei == FIB_NODE_INDEX_INVALID) + return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + /* Enable proxy arp on the interface */ + si = vnet_get_sw_interface (vnm, sw_if_index); + si->flags |= VNET_SW_INTERFACE_FLAG_PROXY_ARP; + + /* Configure proxy arp across the range */ + rv = vnet_proxy_arp_add_del (start, end, fib_index, 0 /* is_del */ ); + + if (rv) + return rv; + + do + { + dpo_id_t dpo = DPO_INVALID; + + vec_add1 (tm->ip4_src_addresses, start[0]); + + /* Add local adjacencies for the range */ + + receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ , + NULL, &dpo); + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + prefix.fp_addr.ip4.as_u32 = start->as_u32; + + fib_table_entry_special_dpo_update (fib_index, + &prefix, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); + dpo_reset (&dpo); + + start_host_byte_order++; + start->as_u32 = clib_host_to_net_u32 (start_host_byte_order); + } + while (start_host_byte_order <= end_host_byte_order); + + return 0; +} + +/** + * \brief Configure an ipv6 source address range + * @param vm vlib_main_t pointer + * @param start first ipv6 address in the source address range + * @param end last ipv6 address in the source address range + * @param table_id VRF / table ID, 0 for the default FIB + * @return 0 if all OK, else an error indication from api_errno.h + */ + +int +tcp_configure_v6_source_address_range (vlib_main_t * vm, + ip6_address_t * start, + ip6_address_t * end, u32 table_id) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + fib_prefix_t prefix; + u32 fib_index = 0; + fib_node_index_t fei; + u32 sw_if_index; + + memset (&prefix, 0, sizeof (prefix)); + + fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id); + + if (fib_index == ~0) + return VNET_API_ERROR_NO_SUCH_FIB; + + while (1) + { + int i; + ip6_address_t tmp; + dpo_id_t dpo = DPO_INVALID; + + /* Remember this address */ + vec_add1 (tm->ip6_src_addresses, start[0]); + + /* Lookup the prefix, to identify the interface involved */ + prefix.fp_len = 128; + prefix.fp_proto = FIB_PROTOCOL_IP6; + memcpy (&prefix.fp_addr.ip6, start, sizeof (ip6_address_t)); + + fei = fib_table_lookup (fib_index, &prefix); + + /* Couldn't find route to destination. Bail out. 
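+     An unroutable address anywhere in the range aborts the whole
+     configuration, since each /128 needs a resolving interface for its
+     proxy ND entry.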
*/ + if (fei == FIB_NODE_INDEX_INVALID) + return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + if (sw_if_index == (u32) ~ 0) + return VNET_API_ERROR_NO_MATCHING_INTERFACE; + + /* Add a proxy neighbor discovery entry for this address */ + ip6_neighbor_proxy_add_del (sw_if_index, start, 0 /* is_del */ ); + + /* Add a receive adjacency for this address */ + receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ , + NULL, &dpo); + + fib_table_entry_special_dpo_update (fib_index, + &prefix, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); + dpo_reset (&dpo); + + /* Done with the entire range? */ + if (!memcmp (start, end, sizeof (start[0]))) + break; + + /* Increment the address. DGMS. */ + tmp = start[0]; + for (i = 15; i >= 0; i--) + { + tmp.as_u8[i] += 1; + if (tmp.as_u8[i] != 0) + break; + } + start[0] = tmp; + } + return 0; +} + +static clib_error_t * +tcp_src_address (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + u32 table_id = 0; + int v4set = 0; + int v6set = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip6_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v6start)); + v6set = 1; + } + else if (unformat (input, "fib-table %d", &table_id)) + ; + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at least one v4 or v6 address required"); + + if (v4set) + { + rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end, + table_id); + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_SUCH_FIB: + return clib_error_return (0, "Invalid table-id %d", table_id); + + case VNET_API_ERROR_INVALID_ARGUMENT: + return clib_error_return (0, "Invalid address range %U - %U", + format_ip4_address, &v4start, + format_ip4_address, &v4end); + default: + return clib_error_return (0, "error %d", rv); + break; + } + } + if (v6set) + { + rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end, + table_id); + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_SUCH_FIB: + return clib_error_return (0, "Invalid table-id %d", table_id); + + default: + return clib_error_return (0, "error %d", rv); + break; + } + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range", + .function = tcp_src_address, +}; +/* *INDENT-ON* */ + +static u8 * +tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb) +{ +#if TCP_SCOREBOARD_TRACE + + scoreboard_trace_elt_t *block; + int i = 0; + + if (!sb->trace) + return s; + + s = format (s, "scoreboard trace:"); + vec_foreach (block, sb->trace) + { + s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end, + block->ack, block->snd_una_max, block->group); + if ((++i % 3) == 0) + s = format (s, "\n"); + } + return s; +#else + return 0; +#endif +} + +static clib_error_t * +tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * 
cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc; + u8 *s = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + s = tcp_scoreboard_dump_trace (s, &tc->sack_sb); + vlib_cli_output (vm, "%v", s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) = +{ + .path = "show tcp scoreboard trace", + .short_help = "show tcp scoreboard trace <connection>", + .function = tcp_show_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ + +u8 * +tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose) +{ + int i, trace_len; + scoreboard_trace_elt_t *trace; + u32 next_ack, left, group, has_new_ack = 0; + tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc; + sack_block_t *block; + + if (!tc) + return s; + + memset (dummy_tc, 0, sizeof (*dummy_tc)); + tcp_connection_timers_init (dummy_tc); + scoreboard_init (&dummy_tc->sack_sb); + dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; + +#if TCP_SCOREBOARD_TRACE + trace = tc->sack_sb.trace; + trace_len = vec_len (tc->sack_sb.trace); +#else + trace = 0; + trace_len = 0; +#endif + + for (i = 0; i < trace_len; i++) + { + if (trace[i].ack != 0) + { + dummy_tc->snd_una = trace[i].ack - 1448; + dummy_tc->snd_una_max = trace[i].ack; + } + } + + left = 0; + while (left < trace_len) + { + group = trace[left].group; + vec_reset_length (dummy_tc->rcv_opts.sacks); + has_new_ack = 0; + while (trace[left].group == group) + { + if (trace[left].ack != 0) + { + if (verbose) + s = format (s, "Adding ack %u, snd_una_max %u, segs: ", + trace[left].ack, trace[left].snd_una_max); + dummy_tc->snd_una_max = trace[left].snd_una_max; + next_ack = trace[left].ack; + has_new_ack = 1; + } + else + { + if (verbose) + s = format (s, "[%u, %u], ", trace[left].start, + trace[left].end); + vec_add2 (dummy_tc->rcv_opts.sacks, block, 1); + block->start = trace[left].start; + block->end = trace[left].end; + } + left++; + } + + /* Push segments */ + tcp_rcv_sacks (dummy_tc, next_ack); + if (has_new_ack) + dummy_tc->snd_una = next_ack + dummy_tc->sack_sb.snd_una_adv; + + if (verbose) + s = format (s, "result: %U", format_tcp_scoreboard, + &dummy_tc->sack_sb); + + } + s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb); + + return s; +} + +static clib_error_t * +tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc = 0; + u8 *str = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + if (!tc) + { + vlib_cli_output (vm, "connection not found"); + return 0; + } + str = tcp_scoreboard_replay (str, tc, 1); + vlib_cli_output (vm, "%v", str); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) = +{ + .path = "tcp replay scoreboard", + 
.short_help = "tcp replay scoreboard <connection>", + .function = tcp_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + vlib_cli_output (vm, "IPv4 TCP punt: %s", + tm->punt_unknown4 ? "enabled" : "disabled"); + vlib_cli_output (vm, "IPv6 TCP punt: %s", + tm->punt_unknown6 ? "enabled" : "disabled"); + return 0; +} +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_tcp_punt_command, static) = +{ + .path = "show tcp punt", + .short_help = "show tcp punt", + .function = show_tcp_punt_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h new file mode 100644 index 00000000..259dbca1 --- /dev/null +++ b/src/vnet/tcp/tcp.h @@ -0,0 +1,985 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _vnet_tcp_h_ +#define _vnet_tcp_h_ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/tcp/tcp_packet.h> +#include <vnet/tcp/tcp_timer.h> +#include <vnet/session/transport.h> +#include <vnet/session/session.h> +#include <vnet/tcp/tcp_debug.h> + +#define TCP_TICK 0.001 /**< TCP tick period (s) */ +#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ +#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ +#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ +#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */ +#define TCP_MAX_OPTION_SPACE 40 + +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_MAX_RX_FIFO_SIZE 4 << 20 +#define TCP_MIN_RX_FIFO_SIZE 4 << 10 +#define TCP_IW_N_SEGMENTS 10 +#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ +#define TCP_USE_SACKS 1 /**< Disable only for testing */ + +/** TCP FSM state definitions as per RFC793. 
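The list below is expanded into the tcp_state_t enum.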
*/ +#define foreach_tcp_fsm_state \ + _(CLOSED, "CLOSED") \ + _(LISTEN, "LISTEN") \ + _(SYN_SENT, "SYN_SENT") \ + _(SYN_RCVD, "SYN_RCVD") \ + _(ESTABLISHED, "ESTABLISHED") \ + _(CLOSE_WAIT, "CLOSE_WAIT") \ + _(FIN_WAIT_1, "FIN_WAIT_1") \ + _(LAST_ACK, "LAST_ACK") \ + _(CLOSING, "CLOSING") \ + _(FIN_WAIT_2, "FIN_WAIT_2") \ + _(TIME_WAIT, "TIME_WAIT") + +typedef enum _tcp_state +{ +#define _(sym, str) TCP_STATE_##sym, + foreach_tcp_fsm_state +#undef _ + TCP_N_STATES +} tcp_state_t; + +format_function_t format_tcp_state; +format_function_t format_tcp_flags; +format_function_t format_tcp_sacks; +format_function_t format_tcp_rcv_sacks; + +/** TCP timers */ +#define foreach_tcp_timer \ + _(RETRANSMIT, "RETRANSMIT") \ + _(DELACK, "DELAYED ACK") \ + _(PERSIST, "PERSIST") \ + _(KEEP, "KEEP") \ + _(WAITCLOSE, "WAIT CLOSE") \ + _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ + _(ESTABLISH, "ESTABLISH") + +typedef enum _tcp_timers +{ +#define _(sym, str) TCP_TIMER_##sym, + foreach_tcp_timer +#undef _ + TCP_N_TIMERS +} tcp_timers_e; + +typedef void (timer_expiration_handler) (u32 index); + +extern timer_expiration_handler tcp_timer_delack_handler; +extern timer_expiration_handler tcp_timer_retransmit_handler; +extern timer_expiration_handler tcp_timer_persist_handler; +extern timer_expiration_handler tcp_timer_retransmit_syn_handler; + +#define TCP_TIMER_HANDLE_INVALID ((u32) ~0) + +/* Timer delays as multiples of 100ms */ +#define TCP_TO_TIMER_TICK TCP_TICK*10 /* Period for converting from TCP + * ticks to timer units */ +#define TCP_DELACK_TIME 1 /* 0.1s */ +#define TCP_ESTABLISH_TIME 750 /* 75s */ +#define TCP_SYN_RCVD_TIME 600 /* 60s */ +#define TCP_2MSL_TIME 300 /* 30s */ +#define TCP_CLOSEWAIT_TIME 20 /* 2s */ +#define TCP_TIMEWAIT_TIME 20 /* 2s */ +#define TCP_CLEANUP_TIME 10 /* 1s Time to wait before cleanup */ +#define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ + +#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ +#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */ +#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ +#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ +#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ + +/** TCP connection flags */ +#define foreach_tcp_connection_flag \ + _(SNDACK, "Send ACK") \ + _(FINSNT, "FIN sent") \ + _(SENT_RCV_WND0, "Sent 0 receive window") \ + _(RECOVERY, "Recovery on") \ + _(FAST_RECOVERY, "Fast Recovery on") \ + _(FR_1_SMSS, "Sent 1 SMSS") \ + _(HALF_OPEN_DONE, "Half-open completed") \ + _(FINPNDG, "FIN pending") + +typedef enum _tcp_connection_flag_bits +{ +#define _(sym, str) TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAG_BITS +} tcp_connection_flag_bits_e; + +typedef enum _tcp_connection_flag +{ +#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAGS +} tcp_connection_flags_e; + +/** TCP buffer flags */ +#define foreach_tcp_buf_flag \ + _ (ACK) /**< Sending ACK. */ \ + _ (DUPACK) /**< Sending DUPACK. 
*/ \ + +enum +{ +#define _(f) TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ + TCP_N_BUF_BITS, +}; + +enum +{ +#define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ +}; + +#define TCP_SCOREBOARD_TRACE (0) +#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */ +#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) + +typedef struct _scoreboard_trace_elt +{ + u32 start; + u32 end; + u32 ack; + u32 snd_una_max; + u32 group; +} scoreboard_trace_elt_t; + +typedef struct _sack_scoreboard_hole +{ + u32 next; /**< Index for next entry in linked list */ + u32 prev; /**< Index for previous entry in linked list */ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ + u8 is_lost; /**< Mark hole as lost */ +} sack_scoreboard_hole_t; + +typedef struct _sack_scoreboard +{ + sack_scoreboard_hole_t *holes; /**< Pool of holes */ + u32 head; /**< Index of first entry */ + u32 tail; /**< Index of last entry */ + u32 sacked_bytes; /**< Number of bytes sacked in sb */ + u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 last_bytes_delivered; /**< Number of sack bytes delivered */ + u32 snd_una_adv; /**< Bytes to add to snd_una */ + u32 high_sacked; /**< Highest byte sacked (fack) */ + u32 high_rxt; /**< Highest retransmitted sequence */ + u32 rescue_rxt; /**< Rescue sequence number */ + u32 lost_bytes; /**< Bytes lost as per RFC6675 */ + u32 cur_rxt_hole; /**< Retransmitting from this hole */ + +#if TCP_SCOREBOARD_TRACE + scoreboard_trace_elt_t *trace; +#endif + +} sack_scoreboard_t; + +#if TCP_SCOREBOARD_TRACE +#define tcp_scoreboard_trace_add(_tc, _ack) \ +{ \ + static u64 _group = 0; \ + sack_scoreboard_t *_sb = &_tc->sack_sb; \ + sack_block_t *_sack, *_sacks; \ + scoreboard_trace_elt_t *_elt; \ + int i; \ + _group++; \ + _sacks = _tc->rcv_opts.sacks; \ + for (i = 0; i < vec_len (_sacks); i++) \ + { \ + _sack = &_sacks[i]; \ + vec_add2 (_sb->trace, _elt, 1); \ + _elt->start = _sack->start; \ + _elt->end = _sack->end; \ + _elt->ack = _elt->end == _ack ? _ack : 0; \ + _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \ + _elt->group = _group; \ + } \ +} +#else +#define tcp_scoreboard_trace_add(_tc, _ack) +#endif + +typedef enum _tcp_cc_algorithm_type +{ + TCP_CC_NEWRENO, +} tcp_cc_algorithm_type_e; + +typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t; + +typedef enum _tcp_cc_ack_t +{ + TCP_CC_ACK, + TCP_CC_DUPACK, + TCP_CC_PARTIALACK +} tcp_cc_ack_t; + +typedef struct _tcp_connection +{ + transport_connection_t connection; /**< Common transport data. First! 
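Must stay first so tcp_get_connection_from_transport() can cast directly.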
 */

+  u8 state;			/**< TCP state as per tcp_state_t */
+  u16 flags;			/**< Connection flags (see tcp_conn_flags_e) */
+  u32 timers[TCP_N_TIMERS];	/**< Timer handles into timer wheel */
+
+  /* TODO RFC4898 */
+
+  /** Send sequence variables RFC793 */
+  u32 snd_una;		/**< oldest unacknowledged sequence number */
+  u32 snd_una_max;	/**< newest unacknowledged sequence number + 1 */
+  u32 snd_wnd;		/**< send window */
+  u32 snd_wl1;		/**< seq number used for last snd.wnd update */
+  u32 snd_wl2;		/**< ack number used for last snd.wnd update */
+  u32 snd_nxt;		/**< next seq number to be sent */
+  u16 snd_mss;		/**< Effective send max seg (data) size */
+
+  /** Receive sequence variables RFC793 */
+  u32 rcv_nxt;		/**< next sequence number expected */
+  u32 rcv_wnd;		/**< receive window we expect */
+
+  u32 rcv_las;		/**< rcv_nxt at last ack sent/rcv_wnd update */
+  u32 iss;		/**< initial sent sequence */
+  u32 irs;		/**< initial remote sequence */
+
+  /* Options */
+  tcp_options_t rcv_opts;	/**< Rx options for connection */
+  tcp_options_t snd_opts;	/**< Tx options for connection */
+  u8 snd_opts_len;	/**< Tx options len */
+  u8 rcv_wscale;	/**< Window scale to advertise to peer */
+  u8 snd_wscale;	/**< Window scale to use when sending */
+  u32 tsval_recent;	/**< Last timestamp received */
+  u32 tsval_recent_age;	/**< When tsval_recent was last updated */
+
+  sack_block_t *snd_sacks;	/**< Vector of SACKs to send. XXX Fixed size? */
+  sack_scoreboard_t sack_sb;	/**< SACK "scoreboard" that tracks holes */
+
+  u16 rcv_dupacks;	/**< Number of DUPACKs received */
+  u8 snt_dupacks;	/**< Number of DUPACKs sent in a burst */
+
+  /* Congestion control */
+  u32 cwnd;		/**< Congestion window */
+  u32 ssthresh;		/**< Slow-start threshold */
+  u32 prev_ssthresh;	/**< ssthresh before congestion */
+  u32 prev_cwnd;	/**< cwnd before congestion */
+  u32 bytes_acked;	/**< Bytes acknowledged by current segment */
+  u32 snd_rxt_bytes;	/**< Retransmitted bytes */
+  u32 snd_rxt_ts;	/**< Timestamp when first packet is retransmitted */
+  u32 tsecr_last_ack;	/**< Timestamp echoed to us in last healthy ACK */
+  u32 snd_congestion;	/**< snd_una_max when congestion is detected */
+  tcp_cc_algorithm_t *cc_algo;	/**< Congestion control algorithm */
+
+  /* RTT and RTO */
+  u32 rto;		/**< Retransmission timeout */
+  u32 rto_boff;		/**< Index for RTO backoff */
+  u32 srtt;		/**< Smoothed RTT */
+  u32 rttvar;		/**< Smoothed mean RTT difference.
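Maintained per RFC6298.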
Approximates variance */ + u32 rtt_ts; /**< Timestamp for tracked ACK */ + u32 rtt_seq; /**< Sequence number for tracked ACK */ + + u16 mss; /**< Our max seg size that includes options */ + u32 limited_transmit; /**< snd_nxt when limited transmit starts */ + u32 last_fib_check; /**< Last time we checked fib route for peer */ +} tcp_connection_t; + +struct _tcp_cc_algorithm +{ + void (*rcv_ack) (tcp_connection_t * tc); + void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack); + void (*congestion) (tcp_connection_t * tc); + void (*recovered) (tcp_connection_t * tc); + void (*init) (tcp_connection_t * tc); +}; + +#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY +#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY +#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY +#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) +#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) +#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) + +#define tcp_in_cong_recovery(tc) ((tc)->flags & \ + (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) + +always_inline void +tcp_cong_recovery_off (tcp_connection_t * tc) +{ + tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); + tcp_fastrecovery_1_smss_off (tc); +} + +typedef enum +{ + TCP_IP4, + TCP_IP6, + TCP_N_AF, +} tcp_af_t; + +typedef enum _tcp_error +{ +#define tcp_error(n,s) TCP_ERROR_##n, +#include <vnet/tcp/tcp_error.def> +#undef tcp_error + TCP_N_ERROR, +} tcp_error_t; + +typedef struct _tcp_lookup_dispatch +{ + u8 next, error; +} tcp_lookup_dispatch_t; + +typedef struct _tcp_main +{ + /* Per-worker thread tcp connection pools */ + tcp_connection_t **connections; + + /* Pool of listeners. */ + tcp_connection_t *listener_pool; + + /** Dispatch table by state and flags */ + tcp_lookup_dispatch_t dispatch_table[TCP_N_STATES][64]; + + u8 log2_tstamp_clocks_per_tick; + f64 tstamp_ticks_per_clock; + u32 *time_now; + + /** per-worker tx buffer free lists */ + u32 **tx_buffers; + /** per-worker tx frames to tcp 4/6 output nodes */ + vlib_frame_t **tx_frames[2]; + /** per-worker tx frames to ip 4/6 lookup nodes */ + vlib_frame_t **ip_lookup_tx_frames[2]; + + /* Per worker-thread timer wheel for connections timers */ + tw_timer_wheel_16t_2w_512sl_t *timer_wheels; + + /* Pool of half-open connections on which we've sent a SYN */ + tcp_connection_t *half_open_connections; + clib_spinlock_t half_open_lock; + + /* Pool of local TCP endpoints */ + transport_endpoint_t *local_endpoints; + + /* Local endpoints lookup table */ + transport_endpoint_table_t local_endpoints_table; + clib_spinlock_t local_endpoints_lock; + + /* Congestion control algorithms registered */ + tcp_cc_algorithm_t *cc_algos; + + /* Flag that indicates if stack is on or off */ + u8 is_enabled; + + /** Number of preallocated connections */ + u32 preallocated_connections; + u32 preallocated_half_open_connections; + + /** Transport table (preallocation) size parameters */ + u32 local_endpoints_table_memory; + u32 local_endpoints_table_buckets; + + /** Vectors of src addresses. 
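Configured via the "tcp src-address" CLI or the tcp_configure_src_addresses API.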
Optional unless one needs > 63K active-opens */ + ip4_address_t *ip4_src_addresses; + u32 last_v4_address_rotor; + u32 last_v6_address_rotor; + ip6_address_t *ip6_src_addresses; + + /** Port allocator random number generator seed */ + u32 port_allocator_seed; + + /** vlib buffer size */ + u32 bytes_per_buffer; + + u8 punt_unknown4; + u8 punt_unknown6; +} tcp_main_t; + +extern tcp_main_t tcp_main; +extern vlib_node_registration_t tcp4_input_node; +extern vlib_node_registration_t tcp6_input_node; +extern vlib_node_registration_t tcp4_output_node; +extern vlib_node_registration_t tcp6_output_node; + +always_inline tcp_main_t * +vnet_get_tcp_main () +{ + return &tcp_main; +} + +always_inline tcp_header_t * +tcp_buffer_hdr (vlib_buffer_t * b) +{ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return (tcp_header_t *) (b->data + b->current_data + + vnet_buffer (b)->tcp.hdr_offset); +} + +clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); + +void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); + +always_inline tcp_connection_t * +tcp_connection_get (u32 conn_index, u32 thread_index) +{ + if (PREDICT_FALSE + (pool_is_free_index (tcp_main.connections[thread_index], conn_index))) + return 0; + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +always_inline tcp_connection_t * +tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) +{ + if (tcp_main.connections[thread_index] == 0) + return 0; + if (pool_is_free_index (tcp_main.connections[thread_index], conn_index)) + return 0; + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +always_inline tcp_connection_t * +tcp_get_connection_from_transport (transport_connection_t * tconn) +{ + return (tcp_connection_t *) tconn; +} + +void tcp_connection_close (tcp_connection_t * tc); +void tcp_connection_cleanup (tcp_connection_t * tc); +void tcp_connection_del (tcp_connection_t * tc); +int tcp_half_open_connection_cleanup (tcp_connection_t * tc); +tcp_connection_t *tcp_connection_new (u8 thread_index); +void tcp_connection_reset (tcp_connection_t * tc); +int tcp_configure_v4_source_address_range (vlib_main_t * vm, + ip4_address_t * start, + ip4_address_t * end, u32 table_id); +int tcp_configure_v6_source_address_range (vlib_main_t * vm, + ip6_address_t * start, + ip6_address_t * end, u32 table_id); +void tcp_api_reference (void); +u8 *format_tcp_connection_id (u8 * s, va_list * args); +u8 *format_tcp_connection (u8 * s, va_list * args); +u8 *format_tcp_scoreboard (u8 * s, va_list * args); + +u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose); + +always_inline tcp_connection_t * +tcp_listener_get (u32 tli) +{ + return pool_elt_at_index (tcp_main.listener_pool, tli); +} + +always_inline tcp_connection_t * +tcp_half_open_connection_get (u32 conn_index) +{ + tcp_connection_t *tc = 0; + clib_spinlock_lock_if_init (&tcp_main.half_open_lock); + if (!pool_is_free_index (tcp_main.half_open_connections, conn_index)) + tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index); + clib_spinlock_unlock_if_init (&tcp_main.half_open_lock); + return tc; +} + +void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); +void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, + u8 is_ip4); +void tcp_send_reset (tcp_connection_t * tc); +void tcp_send_syn (tcp_connection_t * tc); +void 
tcp_send_fin (tcp_connection_t * tc); +void tcp_init_mss (tcp_connection_t * tc); +void tcp_update_snd_mss (tcp_connection_t * tc); +void tcp_update_rto (tcp_connection_t * tc); +void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4); +void tcp_flush_frames_to_output (u8 thread_index); + +always_inline u32 +tcp_end_seq (tcp_header_t * th, u32 len) +{ + return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len; +} + +/* Modulo arithmetic for TCP sequence numbers */ +#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) +#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) +#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) +#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) +#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) + +/* Modulo arithmetic for timestamps */ +#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) +#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) + +/** + * Our estimate of the number of bytes that have left the network + */ +always_inline u32 +tcp_bytes_out (const tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; + else + return tc->rcv_dupacks * tc->snd_mss; +} + +/** + * Our estimate of the number of bytes in flight (pipe size) + */ +always_inline u32 +tcp_flight_size (const tcp_connection_t * tc) +{ + int flight_size; + + flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc) + + tc->snd_rxt_bytes; + + if (flight_size < 0) + { + if (0) + clib_warning + ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d", + tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc), + tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes, + tc->rcv_opts.flags); + return 0; + } + + return flight_size; +} + +/** + * Initial cwnd as per RFC5681 + */ +always_inline u32 +tcp_initial_cwnd (const tcp_connection_t * tc) +{ + if (tc->snd_mss > 2190) + return 2 * tc->snd_mss; + else if (tc->snd_mss > 1095) + return 3 * tc->snd_mss; + else + return 4 * tc->snd_mss; +} + +always_inline u32 +tcp_loss_wnd (const tcp_connection_t * tc) +{ + return tc->snd_mss; +} + +always_inline u32 +tcp_available_snd_wnd (const tcp_connection_t * tc) +{ + return clib_min (tc->cwnd, tc->snd_wnd); +} + +always_inline u32 +tcp_available_output_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + int flight_size = (int) (tc->snd_nxt - tc->snd_una); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +/** + * Estimate of how many bytes we can still push into the network + */ +always_inline u32 +tcp_available_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + u32 flight_size = tcp_flight_size (tc); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +always_inline u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + +i32 tcp_rcv_wnd_available (tcp_connection_t * tc); +u32 tcp_snd_space (tcp_connection_t * tc); +void tcp_update_rcv_wnd (tcp_connection_t * tc); + +void tcp_retransmit_first_unacked (tcp_connection_t * tc); +void tcp_fast_retransmit_no_sack (tcp_connection_t * tc); +void tcp_fast_retransmit_sack (tcp_connection_t * tc); +void tcp_fast_retransmit (tcp_connection_t * tc); +void tcp_cc_init_congestion (tcp_connection_t * tc); +int tcp_cc_recover (tcp_connection_t * tc); +void 
tcp_cc_fastrecovery_exit (tcp_connection_t * tc); + +fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc); + +/* Made public for unit testing only */ +void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); + +always_inline u32 +tcp_time_now (void) +{ + return tcp_main.time_now[vlib_get_thread_index ()]; +} + +always_inline u32 +tcp_set_time_now (u32 thread_index) +{ + tcp_main.time_now[thread_index] = clib_cpu_time_now () + * tcp_main.tstamp_ticks_per_clock; + return tcp_main.time_now[thread_index]; +} + +always_inline void +tcp_update_time (f64 now, u32 thread_index) +{ + tcp_set_time_now (thread_index); + tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], + now); + tcp_flush_frames_to_output (thread_index); +} + +u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); + +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_bytes, vlib_buffer_t ** b); + +void tcp_connection_timers_init (tcp_connection_t * tc); +void tcp_connection_timers_reset (tcp_connection_t * tc); +void tcp_init_snd_vars (tcp_connection_t * tc); +void tcp_connection_init_vars (tcp_connection_t * tc); + +always_inline void +tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + /* Reset flags, make sure ack is sent */ + tc->flags = TCP_CONN_SNDACK; + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +always_inline void +tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); + ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID); + tc->timers[timer_id] + = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline void +tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) +{ + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); + if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) + return; + + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID; +} + +always_inline void +tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); + if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = + tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline void +tcp_retransmit_timer_set (tcp_connection_t * tc) +{ + ASSERT (tc->snd_una != tc->snd_una_max); + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_retransmit_timer_reset (tcp_connection_t * tc) +{ + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); +} + +always_inline void +tcp_retransmit_timer_force_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_persist_timer_set (tcp_connection_t * tc) +{ + /* Reuse RTO. 
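The probe interval is clamped below by TCP_TIMER_PERSIST_MIN.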
It's backed off in handler */ + tcp_timer_set (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, + TCP_TIMER_PERSIST_MIN)); +} + +always_inline void +tcp_persist_timer_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, + TCP_TIMER_PERSIST_MIN)); +} + +always_inline void +tcp_persist_timer_reset (tcp_connection_t * tc) +{ + tcp_timer_reset (tc, TCP_TIMER_PERSIST); +} + +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + if (tc->snd_una == tc->snd_una_max) + { + tcp_retransmit_timer_reset (tc); + if (tc->snd_wnd < tc->snd_mss) + tcp_persist_timer_update (tc); + } + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline u8 +tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) +{ + return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; +} + +#define tcp_validate_txf_size(_tc, _a) \ + ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ + || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a) + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * hole); +void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb); +sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb, + u32 prev_index, u32 start, + u32 end); +sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * + start, u8 have_sent_1_smss, + u8 * can_rescue, + u8 * snd_limited); +void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq); + +always_inline sack_scoreboard_hole_t * +scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) +{ + if (index != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, index); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->next); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_first_hole (sack_scoreboard_t * sb) +{ + if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->head); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_last_hole (sack_scoreboard_t * sb) +{ + if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->tail); + return 0; +} + +always_inline void +scoreboard_clear (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + while ((hole = scoreboard_first_hole (sb))) + { + scoreboard_remove_hole (sb, hole); + } + ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); + ASSERT (pool_elts (sb->holes) == 0); + sb->sacked_bytes = 0; + sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; + sb->snd_una_adv = 0; + sb->high_sacked = 0; + sb->high_rxt = 0; + sb->lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; +} + +always_inline u32 +scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) +{ + return hole->end - hole->start; +} + +always_inline u32 +scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes)); + return hole - sb->holes; +} + 
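+/* A minimal usage sketch for the hole accessors above (illustrative
+ * only, not used by the stack; the helper name is ours): walk the
+ * scoreboard's hole list and total the bytes the holes still cover.
+ * Assumes an initialized scoreboard. */
+always_inline u32
+scoreboard_total_hole_bytes (sack_scoreboard_t * sb)
+{
+  sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb);
+  u32 bytes = 0;
+  while (hole)
+    {
+      bytes += scoreboard_hole_bytes (hole);
+      hole = scoreboard_next_hole (sb, hole);
+    }
+  return bytes;
+}
+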
+always_inline void +scoreboard_init (sack_scoreboard_t * sb) +{ + sb->head = TCP_INVALID_SACK_HOLE_INDEX; + sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; +} + +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); + +always_inline void +tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vec_validate (tm->cc_algos, type); + + tm->cc_algos[type] = *vft; +} + +always_inline tcp_cc_algorithm_t * +tcp_cc_algo_get (tcp_cc_algorithm_type_e type) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + return &tm->cc_algos[type]; +} + +void tcp_cc_init (tcp_connection_t * tc); + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number net order + * @param ack - ack number net order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, + u16 wnd) +{ + tcp_header_t *th; + + th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len); + + th->src_port = sp; + th->dst_port = dp; + th->seq_number = seq; + th->ack_number = ack; + th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4; + th->flags = flags; + th->window = wnd; + th->checksum = 0; + th->urgent_pointer = 0; + return th; +} + +/** + * Push TCP header to buffer + * + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number host order + * @param ack - ack number host order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd) +{ + return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, + clib_host_to_net_u32 (seq), + clib_host_to_net_u32 (ack), + tcp_hdr_opts_len, flags, + clib_host_to_net_u16 (wnd)); +} + +#endif /* _vnet_tcp_h_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_api.c b/src/vnet/tcp/tcp_api.c new file mode 100644 index 00000000..4c3e49ee --- /dev/null +++ b/src/vnet/tcp/tcp_api.c @@ -0,0 +1,119 @@ +/* + *------------------------------------------------------------------ + * tcp_api.c - vnet tcp-layer apis + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/tcp/tcp.h> + +#include <vnet/vnet_msg_enum.h> + +#define vl_typedefs /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <vnet/vnet_all_api_h.h> +#undef vl_printfun + +#include <vlibapi/api_helper_macros.h> + +#define foreach_tcp_api_msg \ +_(TCP_CONFIGURE_SRC_ADDRESSES, tcp_configure_src_addresses) + +static void + vl_api_tcp_configure_src_addresses_t_handler + (vl_api_tcp_configure_src_addresses_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_tcp_configure_src_addresses_reply_t *rmp; + u32 vrf_id; + int rv; + + vrf_id = clib_net_to_host_u32 (mp->vrf_id); + + if (mp->is_ipv6) + rv = tcp_configure_v6_source_address_range + (vm, + (ip6_address_t *) mp->first_address, + (ip6_address_t *) mp->last_address, vrf_id); + else + rv = tcp_configure_v4_source_address_range + (vm, + (ip4_address_t *) mp->first_address, + (ip4_address_t *) mp->last_address, vrf_id); + + REPLY_MACRO (VL_API_TCP_CONFIGURE_SRC_ADDRESSES_REPLY); +} + +#define vl_msg_name_crc_list +#include <vnet/tcp/tcp.api.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (api_main_t * am) +{ +#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id); + foreach_vl_msg_name_crc_tcp; +#undef _ +} + +static clib_error_t * +tcp_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = &api_main; + +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_tcp_api_msg; +#undef _ + + /* + * Set up the (msg_name, crc, message-id) table + */ + setup_message_id_table (am); + + return 0; +} + +VLIB_API_INIT_FUNCTION (tcp_api_hookup); + +void +tcp_api_reference (void) +{ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h new file mode 100755 index 00000000..eb318cde --- /dev/null +++ b/src/vnet/tcp/tcp_debug.h @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_VNET_TCP_TCP_DEBUG_H_ +#define SRC_VNET_TCP_TCP_DEBUG_H_ + +#include <vlib/vlib.h> + +#define TCP_DEBUG (1) +#define TCP_DEBUG_SM (0) +#define TCP_DEBUG_CC (0) +#define TCP_DEBUG_CC_STAT (1) + +#define foreach_tcp_dbg_evt \ + _(INIT, "") \ + _(DEALLOC, "") \ + _(OPEN, "open") \ + _(CLOSE, "close") \ + _(BIND, "bind") \ + _(UNBIND, "unbind") \ + _(DELETE, "delete") \ + _(SYN_SENT, "SYN sent") \ + _(SYNACK_SENT, "SYNACK sent") \ + _(SYNACK_RCVD, "SYNACK rcvd") \ + _(SYN_RXT, "SYN retransmit") \ + _(FIN_SENT, "FIN sent") \ + _(ACK_SENT, "ACK sent") \ + _(DUPACK_SENT, "DUPACK sent") \ + _(RST_SENT, "RST sent") \ + _(SYN_RCVD, "SYN rcvd") \ + _(ACK_RCVD, "ACK rcvd") \ + _(DUPACK_RCVD, "DUPACK rcvd") \ + _(FIN_RCVD, "FIN rcvd") \ + _(RST_RCVD, "RST rcvd") \ + _(STATE_CHANGE, "state change") \ + _(PKTIZE, "packetize") \ + _(INPUT, "in") \ + _(SND_WND, "snd_wnd update") \ + _(OUTPUT, "output") \ + _(TIMER_POP, "timer pop") \ + _(CC_RTX, "retransmit") \ + _(CC_EVT, "cc event") \ + _(CC_PACK, "cc partial ack") \ + _(CC_STAT, "cc stats") \ + _(CC_RTO_STAT, "cc rto stats") \ + _(SEG_INVALID, "invalid segment") \ + _(PAWS_FAIL, "failed paws check") \ + _(ACK_RCV_ERR, "invalid ack") \ + _(RCV_WND_SHRUNK, "shrunk rcv_wnd") \ + +typedef enum _tcp_dbg +{ +#define _(sym, str) TCP_DBG_##sym, + foreach_tcp_dbg_evt +#undef _ +} tcp_dbg_e; + +typedef enum _tcp_dbg_evt +{ +#define _(sym, str) TCP_EVT_##sym, + foreach_tcp_dbg_evt +#undef _ +} tcp_dbg_evt_e; + +#if TCP_DEBUG + +#define TRANSPORT_DEBUG (1) + +/* + * Infra and evt track setup + */ + +#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args) + +#define DECLARE_ETD(_tc, _e, _size) \ + struct \ + { \ + u32 data[_size]; \ + } * ed; \ + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ + _e, _tc->c_elog_track) + +#define TCP_DBG_IP_TAG_LCL(_tc) \ +{ \ + if (_tc->c_is_ip4) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "lcl: %d.%d.%d.%d:%d", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->c_lcl_ip.ip4.as_u8[0]; \ + ed->data[1] = _tc->c_lcl_ip.ip4.as_u8[1]; \ + ed->data[2] = _tc->c_lcl_ip.ip4.as_u8[2]; \ + ed->data[3] = _tc->c_lcl_ip.ip4.as_u8[3]; \ + ed->data[4] = clib_net_to_host_u16(_tc->c_lcl_port); \ + } \ +} + +#define TCP_DBG_IP_TAG_RMT(_tc) \ +{ \ + if (_tc->c_is_ip4) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rmt: %d.%d.%d.%d:%d", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->c_rmt_ip.ip4.as_u8[0]; \ + ed->data[1] = _tc->c_rmt_ip.ip4.as_u8[1]; \ + ed->data[2] = _tc->c_rmt_ip.ip4.as_u8[2]; \ + ed->data[3] = _tc->c_rmt_ip.ip4.as_u8[3]; \ + ed->data[4] = clib_net_to_host_u16(_tc->c_rmt_port); \ + } \ +} + +#define TCP_EVT_INIT_HANDLER(_tc, _is_l, ...) \ +{ \ + char *_fmt = _is_l ? "l[%d].%d:%d%c" : "[%d].%d:%d->.%d:%d%c"; \ + if (_tc->c_is_ip4) \ + { \ + _tc->c_elog_track.name = \ + (char *) format (0, _fmt, _tc->c_thread_index, \ + _tc->c_lcl_ip.ip4.as_u8[3], \ + clib_net_to_host_u16(_tc->c_lcl_port), \ + _tc->c_rmt_ip.ip4.as_u8[3], \ + clib_net_to_host_u16(_tc->c_rmt_port), 0); \ + } \ + else \ + _tc->c_elog_track.name = \ + (char *) format (0, _fmt, _tc->c_thread_index, \ + _tc->c_lcl_ip.ip6.as_u8[15], \ + clib_net_to_host_u16(_tc->c_lcl_port), \ + _tc->c_rmt_ip.ip6.as_u8[15], \ + clib_net_to_host_u16(_tc->c_rmt_port), 0); \ + elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\ + TCP_DBG_IP_TAG_LCL(_tc); \ + TCP_DBG_IP_TAG_RMT(_tc); \ +} + +#define TCP_EVT_DEALLOC_HANDLER(_tc, ...) 
\ +{ \ + vec_free (_tc->c_elog_track.name); \ +} + +#define TCP_EVT_OPEN_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "open: index %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_CLOSE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "close: %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_BIND_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc, 1); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "bind: listener %d", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \ +{ \ + if (_init) \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "syn-rx: irs %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ +} + +#define TCP_EVT_UNBIND_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "unbind: listener %d", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ +} + +#define TCP_EVT_DELETE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "delete: %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ +} + +#define CONCAT_HELPER(_a, _b) _a##_b +#define CC(_a, _b) CONCAT_HELPER(_a, _b) +#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) +#else +#define TCP_EVT_DBG(_evt, _args...) +#define TCP_DBG(_fmt, _args...) +#endif + +/* + * State machine + */ +#if TCP_DEBUG_SM + +#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "state: %s", \ + .format_args = "t4", \ + .n_enum_strings = 11, \ + .enum_strings = { \ + "closed", \ + "listen", \ + "syn-sent", \ + "syn-rcvd", \ + "established", \ + "close_wait", \ + "fin-wait-1", \ + "last-ack", \ + "closing", \ + "fin-wait-2", \ + "time-wait", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->state; \ +} + +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "syn-tx: iss %u snd_una %u snd_una_max %u snd_nxt %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _tc->snd_una_max - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ +} + +#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ +} + +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "fin-tx: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rst-tx: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ +} + +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "fin-rx: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rst-rx: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_SYN_RXT_HANDLER(_tc, _type, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "syn", \ + "syn-ack", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _tc->iss; \ + ed->data[2] = _tc->irs; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ +} + +#else +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_SYN_RXT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...) +#endif + +#if TCP_DEBUG_SM > 1 + +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->rcv_wnd; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_wnd; \ +} + +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-rx: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->bytes_acked; \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _tc->snd_wnd; \ + ed->data[3] = _tc->cwnd; \ + ed->data[4] = tcp_flight_size(_tc); \ +} + +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "tx: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_nxt - _tc->iss; \ + ed->data[2] = tcp_available_output_snd_space (_tc); \ + ed->data[3] = tcp_flight_size (_tc); \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "order", \ + "ooo", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _len; \ + ed->data[2] = _written; \ + ed->data[3] = (_tc->rcv_nxt - _tc->irs) + _written; \ + ed->data[4] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \ +} + +#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) \ +{ \ + tcp_connection_t *_tc; \ + if (_timer_id == TCP_TIMER_RETRANSMIT_SYN \ + || _timer_id == TCP_TIMER_ESTABLISH) \ + { \ + _tc = tcp_half_open_connection_get (_tc_index); \ + } \ + else \ + { \ + u32 _thread_index = vlib_get_thread_index (); \ + _tc = tcp_connection_get (_tc_index, _thread_index); \ + } \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "timer-pop: %s (%d)", \ + .format_args = "t4i4", \ + .n_enum_strings = 7, \ + .enum_strings = { \ + "retransmit", \ + "delack", \ + "persist", \ + "keep", \ + "waitclose", \ + "retransmit syn", \ + "establish", \ + }, \ + }; \ + if (_tc) \ + { \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _timer_id; \ + ed->data[1] = _timer_id; \ + } \ + else \ + { \ + clib_warning ("pop %d for unexisting connection %d", _timer_id, \ + _tc_index); \ + } \ +} + +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->rcv_las - _tc->irs; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->rcv_opts.tsval; \ + ed->data[3] = _tc->tsval_recent; \ +} + +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 3, \ + .enum_strings = { \ + "invalid", \ + "old", \ + "future", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_una_max - _tc->iss; \ +} + +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) 
\ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} +#else +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) +#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) +#endif + +/* + * State machine verbose + */ +#if TCP_DEBUG_SM > 2 +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "snd-wnd update: %u ", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->snd_wnd; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} +#else +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#endif + +/* + * Congestion Control + */ + +#if TCP_DEBUG_CC + +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \ + .format_args = "t4i4i4i4", \ + .n_enum_strings = 6, \ + .enum_strings = { \ + "fast-rxt", \ + "rxt-timeout", \ + "first-rxt", \ + "recovered", \ + "congestion", \ + "undo", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _sub_evt; \ + ed->data[1] = tcp_available_snd_space (_tc); \ + ed->data[2] = _tc->snd_congestion - _tc->iss; \ + ed->data[3] = _tc->snd_rxt_bytes; \ +} + +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = offset; \ + ed->data[2] = n_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ +} + +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_nxt - _tc->irs; \ + ed->data[1] = _tc->rcv_wnd; \ + ed->data[2] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = tcp_available_snd_wnd(_tc); \ + ed->data[4] = _tc->snd_wnd; \ +} + +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->cwnd; \ + ed->data[2] = _tc->snd_wnd; \ + ed->data[3] = tcp_flight_size(_tc); \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "pack: snd_una %u snd_una_max %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_una_max - _tc->iss; \ +} +#else +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#endif + +/* + * Congestion control stats + */ +#if TCP_DEBUG_CC_STAT + +#define STATS_INTERVAL 1 + +#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \ +{ \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rto_stat: rto %u srtt %u rttvar %u ", \ + .format_args = "i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 3); \ + ed->data[0] = _tc->rto; \ + ed->data[1] = _tc->srtt; \ + ed->data[2] = _tc->rttvar; \ +} \ +} + +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \ +{ \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "cc_stat: cwnd %u flight %u space %u ssthresh %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->cwnd; \ + ed->data[1] = tcp_flight_size (_tc); \ + ed->data[2] = tcp_snd_space (_tc); \ + ed->data[3] = _tc->ssthresh; \ + ed->data[4] = _tc->snd_wnd; \ + TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \ + _tc->c_cc_stat_tstamp = tcp_time_now(); \ +} \ +} + +#else +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) +#endif + +#endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def new file mode 100644 index 00000000..a179717f --- /dev/null +++ b/src/vnet/tcp/tcp_error.def @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +tcp_error (NONE, "no error") +tcp_error (LENGTH, "inconsistent ip/tcp lengths") +tcp_error (NO_LISTENER, "no listener for dst port") +tcp_error (LOOKUP_DROPS, "lookup drops") +tcp_error (DISPATCH, "Dispatch error") +tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (PARTIALLY_ENQUEUED, "Packets partially pushed into rx fifo") +tcp_error (PURE_ACK, "Pure acks") +tcp_error (SYNS_RCVD, "SYNs received") +tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") +tcp_error (NOT_READY, "Session not ready for packets") +tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") +tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") +tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") +tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") +tcp_error (SEGMENT_INVALID, "Invalid segments") +tcp_error (SEGMENT_OLD, "Old segment") +tcp_error (ACK_INVALID, "Invalid ACK") +tcp_error (ACK_DUP, "Duplicate ACK") +tcp_error (ACK_OLD, "Old ACK") +tcp_error (ACK_FUTURE, "Future ACK") +tcp_error (PKTS_SENT, "Packets sent") +tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") +tcp_error (RST_SENT, "Resets sent") +tcp_error (INVALID_CONNECTION, "Invalid connection") +tcp_error (NO_WND, "No window") +tcp_error (CONNECTION_CLOSED, "Connection closed") +tcp_error (CREATE_EXISTS, "Connection already exists") +tcp_error (PUNT, "Packets punted")
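
The table above is an X-macro: each consumer defines tcp_error() before including the file, exactly as tcp_input.c below does to build its error-string array. A minimal sketch of the matching enum expansion (illustrative only; the real definition is assumed to live in tcp.h):

typedef enum
{
#define tcp_error(n,s) TCP_ERROR_##n,
#include <vnet/tcp/tcp_error.def>
#undef tcp_error
  TCP_N_ERROR,
} tcp_error_t;

Expanded once per entry this yields TCP_ERROR_NONE, TCP_ERROR_LENGTH, and so on, with TCP_N_ERROR as the running count that the node registrations below pass as .n_errors, so no hand-maintained list is needed.
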
\ No newline at end of file diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c new file mode 100644 index 00000000..1ca2f58e --- /dev/null +++ b/src/vnet/tcp/tcp_format.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * tcp/tcp_format.c: tcp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> +#include <vnet/tcp/tcp.h> + +u8 * +format_tcp_flags (u8 * s, va_list * args) +{ + int flags = va_arg (*args, int); + + s = format (s, "0x%02x", flags); +#define _(f) if (flags & TCP_FLAG_##f) s = format (s, " %s", #f); + foreach_tcp_flag +#undef _ + return s; +} + +/* Format TCP header. */ +u8 * +format_tcp_header (u8 * s, va_list * args) +{ + tcp_header_t *tcp = va_arg (*args, tcp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 header_bytes; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (tcp[0])) + return format (s, "TCP header truncated"); + + indent = format_get_indent (s); + indent += 2; + header_bytes = tcp_header_bytes (tcp); + + s = format (s, "TCP: %d -> %d", clib_net_to_host_u16 (tcp->src), + clib_net_to_host_u16 (tcp->dst)); + + s = format (s, "\n%Useq. 0x%08x ack 0x%08x", format_white_space, indent, + clib_net_to_host_u32 (tcp->seq_number), + clib_net_to_host_u32 (tcp->ack_number)); + + s = format (s, "\n%Uflags %U, tcp header: %d bytes", format_white_space, + indent, format_tcp_flags, tcp->flags, header_bytes); + + s = format (s, "\n%Uwindow %d, checksum 0x%04x", format_white_space, indent, + clib_net_to_host_u16 (tcp->window), + clib_net_to_host_u16 (tcp->checksum)); + + +#if 0 + /* Format TCP options. 
*/ + { + u8 *o; + u8 *option_start = (void *) (tcp + 1); + u8 *option_end = (void *) tcp + header_bytes; + + for (o = option_start; o < option_end;) + { + u32 length = o[1]; + switch (o[0]) + { + case TCP_OPTION_END: + length = 1; + o = option_end; + break; + + case TCP_OPTION_NOOP: + length = 1; + break; + + } + } + } +#endif + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t *im = &ip_main; + tcp_udp_port_info_t *pi; + + pi = ip_get_tcp_udp_port_info (im, tcp->dst); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void *) tcp + header_bytes, + max_header_bytes - header_bytes); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c new file mode 100644 index 00000000..63d6fd87 --- /dev/null +++ b/src/vnet/tcp/tcp_input.c @@ -0,0 +1,3215 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vppinfra/sparse_vec.h> +#include <vnet/tcp/tcp_packet.h> +#include <vnet/tcp/tcp.h> +#include <vnet/session/session.h> +#include <math.h> + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include <vnet/tcp/tcp_error.def> +#undef tcp_error +}; + +/* All TCP nodes have the same outgoing arcs */ +#define foreach_tcp_state_next \ + _ (DROP, "error-drop") \ + _ (TCP4_OUTPUT, "tcp4-output") \ + _ (TCP6_OUTPUT, "tcp6-output") + +typedef enum _tcp_established_next +{ +#define _(s,n) TCP_ESTABLISHED_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_ESTABLISHED_N_NEXT, +} tcp_established_next_t; + +typedef enum _tcp_rcv_process_next +{ +#define _(s,n) TCP_RCV_PROCESS_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_RCV_PROCESS_N_NEXT, +} tcp_rcv_process_next_t; + +typedef enum _tcp_syn_sent_next +{ +#define _(s,n) TCP_SYN_SENT_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_SYN_SENT_N_NEXT, +} tcp_syn_sent_next_t; + +typedef enum _tcp_listen_next +{ +#define _(s,n) TCP_LISTEN_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_LISTEN_N_NEXT, +} tcp_listen_next_t; + +/* Generic, state independent indices */ +typedef enum _tcp_state_next +{ +#define _(s,n) TCP_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_STATE_N_NEXT, +} tcp_state_next_t; + +#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \ + : TCP_NEXT_TCP6_OUTPUT) + +vlib_node_registration_t tcp4_established_node; +vlib_node_registration_t tcp6_established_node; + +/** + * Validate segment sequence number. 
As per RFC793: + * + * Segment Receive Test + * Length Window + * ------- ------- ------------------------------------------- + * 0 0 SEG.SEQ = RCV.NXT + * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * >0 0 not acceptable + * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + * + * This ultimately consists in checking if segment falls within the window. + * The one important difference compared to RFC793 is that we use rcv_las, + * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the + * peer's reference when computing our receive window. + * + * This: + * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las) + * however, is too strict when we have retransmits. Instead we just check that + * the seq is not beyond the right edge and that the end of the segment is not + * less than the left edge. + * + * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so + * use rcv_nxt in the right edge window test instead of rcv_las. + * + */ +always_inline u8 +tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) +{ + return (seq_geq (end_seq, tc->rcv_las) + && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd)); +} + +/** + * Parse TCP header options. + * + * @param th TCP header + * @param to TCP options data structure to be populated + * @return -1 if parsing failed + */ +int +tcp_options_parse (tcp_header_t * th, tcp_options_t * to) +{ + const u8 *data; + u8 opt_len, opts_len, kind; + int j; + sack_block_t b; + + opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); + data = (const u8 *) (th + 1); + + /* Zero out all flags but those set in SYN */ + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + + for (; opts_len > 0; opts_len -= opt_len, data += opt_len) + { + kind = data[0]; + + /* Get options length */ + if (kind == TCP_OPTION_EOL) + break; + else if (kind == TCP_OPTION_NOOP) + { + opt_len = 1; + continue; + } + else + { + /* broken options */ + if (opts_len < 2) + return -1; + opt_len = data[1]; + + /* weird option length */ + if (opt_len < 2 || opt_len > opts_len) + return -1; + } + + /* Parse options */ + switch (kind) + { + case TCP_OPTION_MSS: + if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_MSS; + to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); + } + break; + case TCP_OPTION_WINDOW_SCALE: + if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_WSCALE; + to->wscale = data[2]; + if (to->wscale > TCP_MAX_WND_SCALE) + { + clib_warning ("Illegal window scaling value: %d", + to->wscale); + to->wscale = TCP_MAX_WND_SCALE; + } + } + break; + case TCP_OPTION_TIMESTAMP: + if (opt_len == TCP_OPTION_LEN_TIMESTAMP) + { + to->flags |= TCP_OPTS_FLAG_TSTAMP; + to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); + to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); + } + break; + case TCP_OPTION_SACK_PERMITTED: + if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) + to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + break; + case TCP_OPTION_SACK_BLOCK: + /* If SACK permitted was not advertised or a SYN, break */ + if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) + break; + + /* If too short or not correctly formatted, break */ + if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) + break; + + to->flags |= TCP_OPTS_FLAG_SACK; + to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; + vec_reset_length (to->sacks); + for (j = 0; j < 
to->n_sack_blocks; j++) + { + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j)); + vec_add1 (to->sacks, b); + } + break; + default: + /* Nothing to see here */ + continue; + } + } + return 0; +} + +/** + * RFC1323: Check against wrapped sequence numbers (PAWS). If we have + * timestamp to echo and it's less than tsval_recent, drop segment + * but still send an ACK in order to retain TCP's mechanism for detecting + * and recovering from half-open connections + * + * Or at least that's what the theory says. It seems that this might not work + * very well with packet reordering and fast retransmit. XXX + */ +always_inline int +tcp_segment_check_paws (tcp_connection_t * tc) +{ + return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent + && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent); +} + +/** + * Update tsval recent + */ +always_inline void +tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) +{ + /* + * RFC1323: If Last.ACK.sent falls within the range of sequence numbers + * of an incoming segment: + * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN + * then the TSval from the segment is copied to TS.Recent; + * otherwise, the TSval is ignored. + */ + if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las) + && seq_leq (tc->rcv_las, seq_end)) + { + ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); + tc->tsval_recent = tc->rcv_opts.tsval; + tc->tsval_recent_age = tcp_time_now (); + } +} + +/** + * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19 + * + * It first verifies if segment has a wrapped sequence number (PAWS) and then + * does the processing associated to the first four steps (ignoring security + * and precedence): sequence number, rst bit and syn bit checks. + * + * @return 0 if segments passes validation. + */ +static int +tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, + vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0) +{ + if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) + return -1; + + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) + { + clib_warning ("options parse error"); + return -1; + } + + if (tcp_segment_check_paws (tc0)) + { + if (CLIB_DEBUG > 2) + { + clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); + clib_warning ("seq %u seq_end %u ack %u", + vnet_buffer (b0)->tcp.seq_number - tc0->irs, + vnet_buffer (b0)->tcp.seq_end - tc0->irs, + vnet_buffer (b0)->tcp.ack_number - tc0->iss); + } + TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); + + /* If it just so happens that a segment updates tsval_recent for a + * segment over 24 days old, invalidate tsval_recent. */ + if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, + tcp_time_now ())) + { + /* Age isn't reset until we get a valid tsval (bsd inspired) */ + tc0->tsval_recent = 0; + clib_warning ("paws failed - really old segment. REALLY?"); + } + else + { + /* Drop after ack if not rst */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); + return -1; + } + } + } + + /* 1st: check sequence number */ + if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end)) + { + /* If our window is 0 and the packet is in sequence, let it pass + * through for ack processing. 
It should be dropped later.*/
+ if (tc0->rcv_wnd == 0
+ && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
+ {
+ /* TODO Should segment be tagged? */
+ }
+ else
+ {
+ /* If not RST, send dup ack */
+ if (!tcp_rst (th0))
+ {
+ tcp_make_ack (tc0, b0);
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+ }
+ return -1;
+ }
+ }
+
+ /* 2nd: check the RST bit */
+ if (tcp_rst (th0))
+ {
+ tcp_connection_reset (tc0);
+ return -1;
+ }
+
+ /* 3rd: check security and precedence (skip) */
+
+ /* 4th: check the SYN bit */
+ if (tcp_syn (th0))
+ {
+ /* TODO implement RFC 5961 */
+ if (tc0->state == TCP_STATE_SYN_RCVD)
+ {
+ tcp_make_synack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
+ }
+ else
+ {
+ tcp_make_ack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
+ }
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ return -1;
+ }
+
+ /* If segment in window, save timestamp */
+ tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
+ vnet_buffer (b0)->tcp.seq_end);
+ return 0;
+}
+
+always_inline int
+tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
+{
+ /* SND.UNA =< SEG.ACK =< SND.NXT */
+ return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
+ && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt));
+}
+
+/**
+ * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
+ *
+ * Note that although in the original article srtt and rttvar are scaled
+ * to minimize round-off errors, here we don't. Instead, we rely on
+ * better precision time measurements.
+ *
+ * TODO support us rtt resolution
+ */
+static void
+tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
+{
+ int err, diff;
+
+ if (tc->srtt != 0)
+ {
+ err = mrtt - tc->srtt;
+
+ /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
+ * The increase should be bounded */
+ tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
+ diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
+ tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
+ }
+ else
+ {
+ /* First measurement. */
+ tc->srtt = mrtt;
+ tc->rttvar = mrtt >> 1;
+ }
+}
+
+void
+tcp_update_rto (tcp_connection_t * tc)
+{
+ tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
+}
+
+/**
+ * Update RTT estimate and RTO timer
+ *
+ * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
+ * timing. Middle boxes are known to fiddle with TCP options so we
+ * should give higher priority to ACK timing.
+ *
+ * This should be called only if previously sent bytes have been acked.
+ *
+ * return 1 if valid rtt 0 otherwise
+ */
+static int
+tcp_update_rtt (tcp_connection_t * tc, u32 ack)
+{
+ u32 mrtt = 0;
+
+ /* Karn's rule, part 1. Don't use retransmitted segments to estimate
+ * RTT because they're ambiguous. */
+ if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes)
+ goto done;
+
+ if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
+ {
+ mrtt = tcp_time_now () - tc->rtt_ts;
+ }
+ /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
+ * snd_una, i.e., the left side of the send window:
+ * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
+ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
+ {
+ mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
+ }
+
+ /* Ignore dubious measurements */
+ if (mrtt == 0 || mrtt > TCP_RTT_MAX)
+ goto done;
+
+ tcp_estimate_rtt (tc, mrtt);
+
+done:
+
+ /* Allow measuring of a new RTT */
+ tc->rtt_ts = 0;
+
+ /* If we got here something must've been ACKed so make sure boff is 0,
+ * even if mrtt is not valid since we update the rto below */
+ tc->rto_boff = 0;
+ tcp_update_rto (tc);
+
+ return 0;
+}
+
+/**
+ * Dequeue bytes that have been acked and while at it update RTT estimates.
+ */
+static void
+tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
+{
+ /* Dequeue the newly ACKed and SACKed bytes */
+ stream_session_dequeue_drop (&tc->connection,
+ tc->bytes_acked + tc->sack_sb.snd_una_adv);
+
+ tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
+
+ /* Update rtt and rto */
+ tcp_update_rtt (tc, ack);
+
+ /* If everything has been acked, stop retransmit timer
+ * otherwise update. */
+ tcp_retransmit_timer_update (tc);
+}
+
+/**
+ * Check if duplicate ack as per RFC5681 Sec. 2
+ */
+static u8
+tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
+ u32 prev_snd_una)
+{
+ return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
+ && seq_gt (tc->snd_una_max, tc->snd_una)
+ && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
+ && (prev_snd_wnd == tc->snd_wnd));
+}
+
+/**
+ * Checks if ack is a congestion control event.
+ */
+static u8
+tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
+ u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
+{
+ /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
+ * defined to be 'duplicate' */
+ *is_dack = tc->sack_sb.last_sacked_bytes
+ || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
+
+ return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc));
+}
+
+void
+scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ sack_scoreboard_hole_t *next, *prev;
+
+ if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ next = pool_elt_at_index (sb->holes, hole->next);
+ next->prev = hole->prev;
+ }
+ else
+ {
+ sb->tail = hole->prev;
+ }
+
+ if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ prev = pool_elt_at_index (sb->holes, hole->prev);
+ prev->next = hole->next;
+ }
+ else
+ {
+ sb->head = hole->next;
+ }
+
+ if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+
+ /* Poison the entry */
+ if (CLIB_DEBUG > 0)
+ memset (hole, 0xfe, sizeof (*hole));
+
+ pool_put (sb->holes, hole);
+}
+
+sack_scoreboard_hole_t *
+scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
+ u32 start, u32 end)
+{
+ sack_scoreboard_hole_t *hole, *next, *prev;
+ u32 hole_index;
+
+ pool_get (sb->holes, hole);
+ memset (hole, 0, sizeof (*hole));
+
+ hole->start = start;
+ hole->end = end;
+ hole_index = scoreboard_hole_index (sb, hole);
+
+ prev = scoreboard_get_hole (sb, prev_index);
+ if (prev)
+ {
+ hole->prev = prev_index;
+ hole->next = prev->next;
+
+ if ((next = scoreboard_next_hole (sb, hole)))
+ next->prev = hole_index;
+ else
+ sb->tail = hole_index;
+
+ prev->next = hole_index;
+ }
+ else
+ {
+ sb->head = hole_index;
+ hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
+ hole->next = TCP_INVALID_SACK_HOLE_INDEX;
+ }
+
+ return hole;
+}
+
+void
+scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb)
+{
+ sack_scoreboard_hole_t
*hole, *prev; + u32 bytes = 0, blks = 0; + + sb->lost_bytes = 0; + sb->sacked_bytes = 0; + hole = scoreboard_last_hole (sb); + if (!hole) + return; + + if (seq_gt (sb->high_sacked, hole->end)) + { + bytes = sb->high_sacked - hole->end; + blks = 1; + } + + while ((prev = scoreboard_prev_hole (sb, hole)) + && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD)) + { + bytes += hole->start - prev->end; + blks++; + hole = prev; + } + + while (hole) + { + sb->lost_bytes += scoreboard_hole_bytes (hole); + hole->is_lost = 1; + prev = hole; + hole = scoreboard_prev_hole (sb, hole); + if (hole) + bytes += prev->start - hole->end; + } + sb->sacked_bytes = bytes; +} + +/** + * Figure out the next hole to retransmit + * + * Follows logic proposed in RFC6675 Sec. 4, NextSeg() + */ +sack_scoreboard_hole_t * +scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * start, + u8 have_sent_1_smss, + u8 * can_rescue, u8 * snd_limited) +{ + sack_scoreboard_hole_t *hole = 0; + + hole = start ? start : scoreboard_first_hole (sb); + while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) + hole = scoreboard_next_hole (sb, hole); + + /* Nothing, return */ + if (!hole) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + + /* Rule (1): if higher than rxt, less than high_sacked and lost */ + if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) + { + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + else + { + /* Rule (2): output takes care of transmitting new data */ + if (!have_sent_1_smss) + { + hole = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + } + /* Rule (3): if hole not lost */ + else if (seq_lt (hole->start, sb->high_sacked)) + { + *snd_limited = 1; + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + /* Rule (4): if hole beyond high_sacked */ + else + { + ASSERT (seq_geq (hole->start, sb->high_sacked)); + *snd_limited = 1; + *can_rescue = 1; + /* HighRxt MUST NOT be updated */ + return 0; + } + } + + if (hole && seq_lt (sb->high_rxt, hole->start)) + sb->high_rxt = hole->start; + + return hole; +} + +void +scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (sb); + if (hole) + { + seq = seq_gt (seq, hole->start) ? seq : hole->start; + sb->cur_rxt_hole = sb->head; + } + sb->high_rxt = seq; +} + +/** + * Test that scoreboard is sane after recovery + * + * Returns 1 if scoreboard is empty or if first hole beyond + * snd_una. 
+ */ +u8 +tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (&tc->sack_sb); + return (!hole || seq_geq (hole->start, tc->snd_una)); +} + +void +tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) +{ + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *blk, tmp; + sack_scoreboard_hole_t *hole, *next_hole, *last_hole; + u32 blk_index = 0, old_sacked_bytes, hole_index; + int i, j; + + sb->last_sacked_bytes = 0; + sb->snd_una_adv = 0; + old_sacked_bytes = sb->sacked_bytes; + sb->last_bytes_delivered = 0; + + if (!tcp_opts_sack (&tc->rcv_opts) + && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + return; + + /* Remove invalid blocks */ + blk = tc->rcv_opts.sacks; + while (blk < vec_end (tc->rcv_opts.sacks)) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_una_max)) + { + blk++; + continue; + } + vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); + } + + /* Add block for cumulative ack */ + if (seq_gt (ack, tc->snd_una)) + { + tmp.start = tc->snd_una; + tmp.end = ack; + vec_add1 (tc->rcv_opts.sacks, tmp); + } + + if (vec_len (tc->rcv_opts.sacks) == 0) + return; + + tcp_scoreboard_trace_add (tc, ack); + + /* Make sure blocks are ordered */ + for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++) + for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++) + if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start)) + { + tmp = tc->rcv_opts.sacks[i]; + tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j]; + tc->rcv_opts.sacks[j] = tmp; + } + + if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) + { + /* If no holes, insert the first that covers all outstanding bytes */ + last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + tc->snd_una, tc->snd_una_max); + sb->tail = scoreboard_hole_index (sb, last_hole); + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; + sb->high_sacked = tmp.end; + } + else + { + /* If we have holes but snd_una_max is beyond the last hole, update + * last hole end */ + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; + last_hole = scoreboard_last_hole (sb); + if (seq_gt (tc->snd_una_max, last_hole->end)) + { + if (seq_geq (last_hole->start, sb->high_sacked)) + { + last_hole->end = tc->snd_una_max; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_una_max)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_una_max); + } + } + /* Keep track of max byte sacked for when the last hole + * is acked */ + if (seq_gt (tmp.end, sb->high_sacked)) + sb->high_sacked = tmp.end; + } + + /* Walk the holes with the SACK blocks */ + hole = pool_elt_at_index (sb->holes, sb->head); + while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) + { + blk = &tc->rcv_opts.sacks[blk_index]; + if (seq_leq (blk->start, hole->start)) + { + /* Block covers hole. 
Remove hole */ + if (seq_geq (blk->end, hole->end)) + { + next_hole = scoreboard_next_hole (sb, hole); + + /* Byte accounting: snd_una needs to be advanced */ + if (blk->end == ack) + { + if (next_hole) + { + if (seq_lt (ack, next_hole->start)) + sb->snd_una_adv = next_hole->start - ack; + sb->last_bytes_delivered += + next_hole->start - hole->end; + } + else + { + ASSERT (seq_geq (sb->high_sacked, ack)); + sb->snd_una_adv = sb->high_sacked - ack; + sb->last_bytes_delivered += sb->high_sacked - hole->end; + } + } + + scoreboard_remove_hole (sb, hole); + hole = next_hole; + } + /* Partial 'head' overlap */ + else + { + if (seq_gt (blk->end, hole->start)) + { + hole->start = blk->end; + } + blk_index++; + } + } + else + { + /* Hole must be split */ + if (seq_lt (blk->end, hole->end)) + { + hole_index = scoreboard_hole_index (sb, hole); + next_hole = scoreboard_insert_hole (sb, hole_index, blk->end, + hole->end); + + /* Pool might've moved */ + hole = scoreboard_get_hole (sb, hole_index); + hole->end = blk->start; + blk_index++; + ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); + } + else if (seq_lt (blk->start, hole->end)) + { + hole->end = blk->start; + } + hole = scoreboard_next_hole (sb, hole); + } + } + + scoreboard_update_bytes (tc, sb); + sb->last_sacked_bytes = sb->sacked_bytes + - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); + ASSERT (sb->sacked_bytes == 0 + || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max + - seq_max (tc->snd_una, ack)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->holes[sb->head].start == ack + sb->snd_una_adv); +} + +/** + * Try to update snd_wnd based on feedback received from peer. + * + * If successful, and new window is 'effectively' 0, activate persist + * timer. 
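 *
 * (A sketch of 'effectively' 0, read off the code below: snd_wnd < snd_mss
 * is treated as a zero window, since less than one full segment can be
 * sent, and the persist timer rather than the retransmit timer then takes
 * over probing the peer.)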
+ */ +static void +tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) +{ + /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ + if (seq_lt (tc->snd_wl1, seq) + || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) + { + tc->snd_wnd = snd_wnd; + tc->snd_wl1 = seq; + tc->snd_wl2 = ack; + TCP_EVT_DBG (TCP_EVT_SND_WND, tc); + + if (tc->snd_wnd < tc->snd_mss) + { + /* Set persist timer if not set and we just got 0 wnd */ + if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST) + && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)) + tcp_persist_timer_set (tc); + } + else + { + tcp_persist_timer_reset (tc); + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + } + } +} + +void +tcp_cc_init_congestion (tcp_connection_t * tc) +{ + tcp_fastrecovery_on (tc); + tc->snd_congestion = tc->snd_una_max; + tc->cc_algo->congestion (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); +} + +static void +tcp_cc_recovery_exit (tcp_connection_t * tc) +{ + /* Deflate rto */ + tc->rto_boff = 0; + tcp_update_rto (tc); + tc->snd_rxt_ts = 0; + tc->snd_nxt = tc->snd_una_max; + tcp_recovery_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); +} + +void +tcp_cc_fastrecovery_exit (tcp_connection_t * tc) +{ + tc->cc_algo->recovered (tc); + tc->snd_rxt_bytes = 0; + tc->rcv_dupacks = 0; + tc->snd_nxt = tc->snd_una_max; + tcp_fastrecovery_off (tc); + tcp_fastrecovery_1_smss_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); +} + +static void +tcp_cc_congestion_undo (tcp_connection_t * tc) +{ + tc->cwnd = tc->prev_cwnd; + tc->ssthresh = tc->prev_ssthresh; + tc->snd_nxt = tc->snd_una_max; + tc->rcv_dupacks = 0; + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + ASSERT (tc->rto_boff == 0); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5); + /* TODO extend for fastrecovery */ +} + +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tcp_in_recovery (tc) && tc->rto_boff == 1 + && tc->snd_rxt_ts + && tcp_opts_tstamp (&tc->rcv_opts) + && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); +} + +int +tcp_cc_recover (tcp_connection_t * tc) +{ + ASSERT (tcp_in_cong_recovery (tc)); + if (tcp_cc_is_spurious_retransmit (tc)) + { + tcp_cc_congestion_undo (tc); + return 1; + } + + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + ASSERT (tc->rto_boff == 0); + ASSERT (!tcp_in_cong_recovery (tc)); + ASSERT (tcp_scoreboard_is_sane_post_recovery (tc)); + return 0; +} + +static void +tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) +{ + ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc)); + + /* Congestion avoidance */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + + /* If a cumulative ack, make sure dupacks is 0 */ + tc->rcv_dupacks = 0; + + /* When dupacks hits the threshold we only enter fast retransmit if + * cumulative ack covers more than snd_congestion. Should snd_una + * wrap this test may fail under otherwise valid circumstances. + * Therefore, proactively update snd_congestion when wrap detected. 
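 * (Reading the test below: snd_una - tc->bytes_acked is the pre-ack
 * snd_una, so the condition asks whether the old snd_una had already
 * reached snd_congestion while the new snd_una appears to precede it,
 * which is only possible if the 32-bit sequence space wrapped in between.)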
*/
+ if (PREDICT_FALSE
+ (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
+ && seq_gt (tc->snd_congestion, tc->snd_una)))
+ tc->snd_congestion = tc->snd_una - 1;
+}
+
+static u8
+tcp_should_fastrecover_sack (tcp_connection_t * tc)
+{
+ return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
+}
+
+static u8
+tcp_should_fastrecover (tcp_connection_t * tc)
+{
+ return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
+ || tcp_should_fastrecover_sack (tc));
+}
+
+/**
+ * One function to rule them all ... and in the darkness bind them
+ */
+static void
+tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
+{
+ u32 rxt_delivered;
+
+ /*
+ * Duplicate ACK. Check if we should enter fast recovery, or if already in
+ * it account for the bytes that left the network.
+ */
+ if (is_dack)
+ {
+ ASSERT (tc->snd_una != tc->snd_una_max
+ || tc->sack_sb.last_sacked_bytes);
+
+ tc->rcv_dupacks++;
+
+ if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
+ {
+ ASSERT (tcp_in_fastrecovery (tc));
+ /* Pure duplicate ack. If some data got acked, it's handled below */
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ return;
+ }
+ else if (tcp_should_fastrecover (tc))
+ {
+ /* Things are already bad */
+ if (tcp_in_cong_recovery (tc))
+ {
+ tc->rcv_dupacks = 0;
+ goto partial_ack_test;
+ }
+
+ /* If one of the two conditions below holds, reset dupacks because
+ * we're probably after timeout (RFC6582 heuristics).
+ * If Cumulative ack does not cover more than congestion threshold,
+ * and:
+ * 1) The following doesn't hold: The congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes
+ * 2) Echoed timestamp in the last non-dup ack does not equal the
+ * stored timestamp
+ */
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ {
+ tc->rcv_dupacks = 0;
+ return;
+ }
+
+ tcp_cc_init_congestion (tc);
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+
+ /* The first segment MUST be retransmitted */
+ tcp_retransmit_first_unacked (tc);
+
+ /* Post retransmit update cwnd to ssthresh and account for the
+ * three segments that have left the network and should've been
+ * buffered at the receiver XXX */
+ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
+ ASSERT (tc->cwnd >= tc->snd_mss);
+
+ /* If cwnd allows, send more data */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ scoreboard_init_high_rxt (&tc->sack_sb,
+ tc->snd_una + tc->snd_mss);
+ tcp_fast_retransmit_sack (tc);
+ }
+ else
+ {
+ tcp_fast_retransmit_no_sack (tc);
+ }
+
+ return;
+ }
+ else if (!tc->bytes_acked
+ || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
+ {
+ tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ return;
+ }
+ else
+ goto partial_ack;
+ }
+
+partial_ack_test:
+
+ if (!tc->bytes_acked)
+ return;
+
+partial_ack:
+ /*
+ * Legitimate ACK. 1) See if we can exit recovery
+ */
+ /* XXX limit this only to first partial ack? */
+ tcp_retransmit_timer_update (tc);
+
+ if (seq_geq (tc->snd_una, tc->snd_congestion))
+ {
+ /* If the retransmit was spurious, we've already updated everything */
+ if (tcp_cc_recover (tc))
+ {
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ return;
+ }
+
+ tc->snd_nxt = tc->snd_una_max;
+
+ /* Treat as congestion avoidance ack */
+ tc->cc_algo->rcv_ack (tc);
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ return;
+ }
+
+ /*
+ * Legitimate ACK.
2) If PARTIAL ACK try to retransmit + */ + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0 */ + tc->rcv_dupacks = 0; + + tcp_retransmit_first_unacked (tc); + + /* Post RTO timeout don't try anything fancy */ + if (tcp_in_recovery (tc)) + return; + + /* Remove retransmitted bytes that have been delivered */ + ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); + + if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) + { + /* If we have sacks and we haven't gotten an ack beyond high_rxt, + * remove sacked bytes delivered */ + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; + ASSERT (tc->snd_rxt_bytes >= rxt_delivered); + tc->snd_rxt_bytes -= rxt_delivered; + } + else + { + /* Either all retransmitted holes have been acked, or we're + * "in the blind" and retransmitting segment by segment */ + tc->snd_rxt_bytes = 0; + } + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* + * Since this was a partial ack, try to retransmit some more data + */ + tcp_fast_retransmit (tc); +} + +void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); + tc->cc_algo->init (tc); +} + +/** + * Process incoming ACK + */ +static int +tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_header_t * th, u32 * next, u32 * error) +{ + u32 prev_snd_wnd, prev_snd_una; + u8 is_dack; + + TCP_EVT_DBG (TCP_EVT_CC_STAT, tc); + + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ + if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) + { + /* If we have outstanding data and this is within the window, accept it, + * probably retransmit has timed out. 
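 * (Worked sketch of the accept case, with illustrative numbers: say
 * snd_una = snd_nxt = 100 after an RTO rewound snd_nxt, and
 * snd_una_max = 200. An incoming ACK of 150 is beyond snd_nxt but covers
 * bytes that really were sent, so snd_nxt is advanced to 150 and
 * processing continues with TCP_ERROR_ACK_FUTURE.)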
Otherwise ACK segment and then + * drop it */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0, + vnet_buffer (b)->tcp.ack_number); + return -1; + } + + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, + vnet_buffer (b)->tcp.ack_number); + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; + } + + /* If old ACK, probably it's an old dupack */ + if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))) + { + *error = TCP_ERROR_ACK_OLD; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, + vnet_buffer (b)->tcp.ack_number); + if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); + tcp_cc_handle_event (tc, 1); + } + /* Don't drop yet */ + return 0; + } + + /* + * Looks okay, process feedback + */ + + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); + + prev_snd_wnd = tc->snd_wnd; + prev_snd_una = tc->snd_una; + tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, + vnet_buffer (b)->tcp.ack_number, + clib_net_to_host_u16 (th->window) << tc->snd_wscale); + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + tcp_validate_txf_size (tc, tc->bytes_acked); + + if (tc->bytes_acked) + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + + /* + * Check if we have congestion event + */ + + if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) + { + tcp_cc_handle_event (tc, is_dack); + if (!tcp_in_cong_recovery (tc)) + return 0; + *error = TCP_ERROR_ACK_DUP; + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); + return vnet_buffer (b)->tcp.data_len ? 0 : -1; + } + + /* + * Update congestion control (slow start/congestion avoidance) + */ + tcp_cc_update (tc, b); + + return 0; +} + +static u8 +tcp_sack_vector_is_sane (sack_block_t * sacks) +{ + int i; + for (i = 1; i < vec_len (sacks); i++) + { + if (sacks[i - 1].end == sacks[i].start) + return 0; + } + return 1; +} + +/** + * Build SACK list as per RFC2018. + * + * Makes sure the first block contains the segment that generated the current + * ACK and the following ones are the ones most recently reported in SACK + * blocks. + * + * @param tc TCP connection for which the SACK list is updated + * @param start Start sequence number of the newest SACK block + * @param end End sequence of the newest SACK block + */ +void +tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) +{ + sack_block_t *new_list = 0, *block = 0; + int i; + + /* If the first segment is ooo add it to the list. Last write might've moved + * rcv_nxt over the first segment. */ + if (seq_lt (tc->rcv_nxt, start)) + { + vec_add2 (new_list, block, 1); + block->start = start; + block->end = end; + } + + /* Find the blocks still worth keeping. 
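 * (Concretely, in the loop below: a block survives only if rcv_nxt has not
 * advanced past its start, blocks overlapping the newest one are merged
 * into it, and anything beyond TCP_MAX_SACK_BLOCKS is dropped with a
 * warning.)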
*/
+ for (i = 0; i < vec_len (tc->snd_sacks); i++)
+ {
+ /* Discard if rcv_nxt advanced beyond current block */
+ if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
+ continue;
+
+ /* Merge or drop if segment overlapped by the new segment */
+ if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
+ && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
+ {
+ if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
+ new_list[0].start = tc->snd_sacks[i].start;
+ if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
+ new_list[0].end = tc->snd_sacks[i].end;
+ continue;
+ }
+
+ /* Save to new SACK list if we have space. */
+ if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
+ {
+ vec_add1 (new_list, tc->snd_sacks[i]);
+ }
+ else
+ {
+ clib_warning ("sack discarded");
+ }
+ }
+
+ ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
+
+ /* Replace old vector with new one */
+ vec_free (tc->snd_sacks);
+ tc->snd_sacks = new_list;
+
+ /* Segments should not 'touch' */
+ ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
+}
+
+/** Enqueue data for delivery to application */
+always_inline int
+tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
+ u16 data_len)
+{
+ int written, error = TCP_ERROR_ENQUEUED;
+
+ ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
+ /* Pure ACK. Update rcv_nxt and be done. */
+ if (PREDICT_FALSE (data_len == 0))
+ {
+ return TCP_ERROR_PURE_ACK;
+ }
+
+ written = stream_session_enqueue_data (&tc->connection, b, 0,
+ 1 /* queue event */ , 1);
+
+ TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);
+
+ /* Update rcv_nxt */
+ if (PREDICT_TRUE (written == data_len))
+ {
+ tc->rcv_nxt += written;
+ }
+ /* If more data written than expected, account for out-of-order bytes. */
+ else if (written > data_len)
+ {
+ tc->rcv_nxt += written;
+
+ /* Send ACK confirming the update */
+ tc->flags |= TCP_CONN_SNDACK;
+ }
+ else if (written > 0)
+ {
+ /* We've written something but FIFO is probably full now */
+ tc->rcv_nxt += written;
+
+ /* Depending on how fast the app is, not all of the remaining buffers
+ * in the burst may be enqueued. Inform peer */
+ tc->flags |= TCP_CONN_SNDACK;
+
+ error = TCP_ERROR_PARTIALLY_ENQUEUED;
+ }
+ else
+ {
+ tc->flags |= TCP_CONN_SNDACK;
+ return TCP_ERROR_FIFO_FULL;
+ }
+
+ /* Update SACK list if need be */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ /* Remove SACK blocks that have been delivered */
+ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
+ }
+
+ return error;
+}
+
+/** Enqueue out-of-order data */
+always_inline int
+tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
+ u16 data_len)
+{
+ stream_session_t *s0;
+ int rv, offset;
+
+ ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
+
+ /* Pure ACK.
Do nothing */ + if (PREDICT_FALSE (data_len == 0)) + { + return TCP_ERROR_PURE_ACK; + } + + /* Enqueue out-of-order data with relative offset */ + rv = stream_session_enqueue_data (&tc->connection, b, + vnet_buffer (b)->tcp.seq_number - + tc->rcv_nxt, 0 /* queue event */ , 0); + + /* Nothing written */ + if (rv) + { + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0); + return TCP_ERROR_FIFO_FULL; + } + + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); + + /* Update SACK list if in use */ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + { + ooo_segment_t *newest; + u32 start, end; + + s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); + + /* Get the newest segment from the fifo */ + newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); + if (newest) + { + offset = ooo_segment_offset (s0->server_rx_fifo, newest); + ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt); + start = tc->rcv_nxt + offset; + end = start + ooo_segment_length (s0->server_rx_fifo, newest); + tcp_update_sack_list (tc, start, end); + svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo); + } + } + + return TCP_ERROR_ENQUEUED; +} + +/** + * Check if ACK could be delayed. If ack can be delayed, it should return + * true for a full frame. If we're always acking return 0. + */ +always_inline int +tcp_can_delack (tcp_connection_t * tc) +{ + /* Send ack if ... */ + if (TCP_ALWAYS_ACK + /* just sent a rcv wnd 0 */ + || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 + /* constrained to send ack */ + || (tc->flags & TCP_CONN_SNDACK) != 0 + /* we're almost out of tx wnd */ + || tcp_available_snd_space (tc) < 4 * tc->snd_mss) + return 0; + + return 1; +} + +static int +tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) +{ + u32 discard, first = b->current_length; + vlib_main_t *vm = vlib_get_main (); + + /* Handle multi-buffer segments */ + if (n_bytes_to_drop > b->current_length) + { + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + return -1; + do + { + discard = clib_min (n_bytes_to_drop, b->current_length); + vlib_buffer_advance (b, discard); + b = vlib_get_buffer (vm, b->next_buffer); + n_bytes_to_drop -= discard; + } + while (n_bytes_to_drop); + if (n_bytes_to_drop > first) + b->total_length_not_including_first_buffer -= n_bytes_to_drop - first; + } + else + vlib_buffer_advance (b, n_bytes_to_drop); + vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop; + return 0; +} + +static int +tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, + u32 * next0) +{ + u32 error = 0, n_bytes_to_drop, n_data_bytes; + + vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset); + n_data_bytes = vnet_buffer (b)->tcp.data_len; + ASSERT (n_data_bytes); + + /* Handle out-of-order data */ + if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) + { + /* Old sequence numbers allowed through because they overlapped + * the rx window */ + if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) + { + error = TCP_ERROR_SEGMENT_OLD; + *next0 = TCP_NEXT_DROP; + + /* Completely in the past (possible retransmit) */ + if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) + { + /* Ack retransmissions since we may not have any data to send */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + goto done; + } + + /* Chop off the bytes in the past */ + n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; + n_data_bytes -= n_bytes_to_drop; + vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; + if (tcp_buffer_discard_bytes (b, n_bytes_to_drop)) + goto done; 
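 /* Worked sketch of the trim above, with illustrative numbers: if
  * rcv_nxt = 1000 and a retransmission arrives with seq_number = 900 and
  * data_len = 300 (seq_end = 1200), the segment is not purely old because
  * seq_end is beyond rcv_nxt; n_bytes_to_drop = 1000 - 900 = 100, the
  * buffer is advanced past those bytes, seq_number is rewritten to 1000,
  * and the remaining 200 in-sequence bytes fall through to in_order. */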
+
+ goto in_order;
+ }
+
+ error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
+
+ /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open
+ * cwnd on remote peer when congested 2) acks leaving should have the
+ * latest rcv_wnd since the burst may have eaten up all of it, so only
+ * the old ones could be filtered.
+ */
+
+ /* RFC2581: Send DUPACK for fast retransmit */
+ tcp_make_ack (tc, b);
+ *next0 = tcp_next_output (tc->c_is_ip4);
+
+ /* Mark as DUPACK. We may filter these in output if
+ * the burst fills the holes. */
+ if (n_data_bytes)
+ vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
+
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
+ goto done;
+ }
+
+in_order:
+
+ /* In order data, enqueue. Fifo figures out by itself if any out-of-order
+ * segments can be enqueued after fifo tail offset changes. */
+ error = tcp_session_enqueue_data (tc, b, n_data_bytes);
+
+ /* Check if ACK can be delayed */
+ if (tcp_can_delack (tc))
+ {
+ if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
+ tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+ goto done;
+ }
+
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ tcp_make_ack (tc, b);
+
+done:
+ return error;
+}
+
+typedef struct
+{
+ tcp_header_t tcp_header;
+ tcp_connection_t tcp_connection;
+} tcp_rx_trace_t;
+
+u8 *
+format_tcp_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+ uword indent = format_get_indent (s);
+
+ s = format (s, "%U\n%U%U",
+ format_tcp_header, &t->tcp_header, 128,
+ format_white_space, indent,
+ format_tcp_connection, &t->tcp_connection, 1);
+
+ return s;
+}
+
+u8 *
+format_tcp_rx_trace_short (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
+
+ s = format (s, "%d -> %d (%U)",
+ clib_net_to_host_u16 (t->tcp_header.src_port),
+ clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
+ t->tcp_connection.state);
+
+ return s;
+}
+
+void
+tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
+ tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
+{
+ if (tc0)
+ {
+ clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection));
+ }
+ else
+ {
+ th0 = tcp_buffer_hdr (b0);
+ }
+ clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
+}
+
+always_inline void
+tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val)
+{
+ if (PREDICT_TRUE (!val))
+ return;
+
+ if (is_ip4)
+ vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val);
+ else
+ vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val);
+}
+
+always_inline uword
+tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_ip4)
+{
+ u32 n_left_from, next_index, *from, *to_next;
+ u32 my_thread_index = vm->thread_index, errors = 0;
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u8 is_fin = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ tcp_header_t *th0 = 0;
+ tcp_connection_t *tc0;
+ u32 next0 =
TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto done; + } + + th0 = tcp_buffer_hdr (b0); + /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a + * dangling reference. */ + is_fin = tcp_is_fin (th0); + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len; + + /* TODO header prediction fast path */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, + vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); + goto done; + } + + /* 5: check the ACK field */ + if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) + goto done; + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); + + /* 8: check the FIN bit */ + if (PREDICT_FALSE (is_fin)) + { + /* Enter CLOSE-WAIT and notify session. To avoid lingering + * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ + /* Account for the FIN if nothing else was received */ + if (vnet_buffer (b0)->tcp.data_len == 0) + tc0->rcv_nxt += 1; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; + stream_session_disconnect_notify (&tc0->connection); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + } + + done: + b0->error = node->errors[error0]; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4); + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); + tcp_flush_frame_to_output (vm, my_thread_index, is_ip4); + + return from_frame->n_vectors; +} + +static uword +tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_established_node) = +{ + .function = tcp4_established, + .name = "tcp4-established", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_established_node) = +{ + .function = tcp6_established, + .name = "tcp6-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); + +vlib_node_registration_t tcp4_syn_sent_node; +vlib_node_registration_t tcp6_syn_sent_node; + +static u8 +tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) +{ + transport_connection_t *tmp; + if (!tc) + return 1; + + u8 is_valid = (tc->c_lcl_port == hdr->dst_port + && (tc->state == TCP_STATE_LISTEN + || tc->c_rmt_port == hdr->src_port)); + + if (!is_valid) + { + if ((tmp = + stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, + tc->c_lcl_port, tc->c_rmt_port, + tc->c_transport_proto))) + { + if (tmp->lcl_port == hdr->dst_port + && tmp->rmt_port == hdr->src_port) + { + clib_warning ("half-open is valid!"); + } + } + } + return is_valid; +} + +/** + * Lookup transport connection + */ +static tcp_connection_t * +tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4) +{ + tcp_header_t *tcp; + transport_connection_t *tconn; + tcp_connection_t *tc; + if (is_ip4) + { + ip4_header_t *ip4; + ip4 = vlib_buffer_get_current (b); + tcp = ip4_next_header (ip4); + tconn = stream_session_lookup_transport_wt4 (&ip4->dst_address, + &ip4->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP4_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + else + { + ip6_header_t *ip6; + ip6 = vlib_buffer_get_current (b); + tcp = ip6_next_header (ip6); + tconn = stream_session_lookup_transport_wt6 (&ip6->dst_address, + &ip6->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP6_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + return tc; +} + +always_inline uword +tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index, errors = 0; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, ack0, seq0; + vlib_buffer_t *b0; + tcp_rx_trace_t *t0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + tcp_connection_t *new_tc0; + u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = + 
tcp_half_open_connection_get (vnet_buffer (b0)->
+						      tcp.connection_index);
+	  if (PREDICT_FALSE (tc0 == 0))
+	    {
+	      error0 = TCP_ERROR_INVALID_CONNECTION;
+	      goto drop;
+	    }
+
+	  /* Half-open completed recently but the connection wasn't yet
+	   * removed by the owning thread */
+	  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
+	    {
+	      /* Make sure the connection actually exists */
+	      ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4));
+	      goto drop;
+	    }
+
+	  ack0 = vnet_buffer (b0)->tcp.ack_number;
+	  seq0 = vnet_buffer (b0)->tcp.seq_number;
+	  tcp0 = tcp_buffer_hdr (b0);
+
+	  /* Crude check to see if the connection handle does not match
+	   * the packet. Probably the connection just switched to established */
+	  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
+			     || tcp0->src_port != tc0->c_rmt_port))
+	    goto drop;
+
+	  if (PREDICT_FALSE
+	      (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
+	    goto drop;
+
+	  /* SYNs, FINs and data consume sequence numbers */
+	  vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0)
+	    + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len;
+
+	  /*
+	   * 1. check the ACK bit
+	   */
+
+	  /*
+	   *   If the ACK bit is set
+	   *     If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
+	   *     the RST bit is set, if so drop the segment and return)
+	   *       <SEQ=SEG.ACK><CTL=RST>
+	   *     and discard the segment.  Return.
+	   *     If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+	   */
+	  if (tcp_ack (tcp0))
+	    {
+	      if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
+		{
+		  clib_warning ("ack not in rcv wnd");
+		  if (!tcp_rst (tcp0))
+		    tcp_send_reset_w_pkt (tc0, b0, is_ip4);
+		  goto drop;
+		}
+
+	      /* Make sure ACK is valid */
+	      if (seq_gt (tc0->snd_una, ack0))
+		{
+		  clib_warning ("ack invalid");
+		  goto drop;
+		}
+	    }
+
+	  /*
+	   * 2. check the RST bit
+	   */
+
+	  if (tcp_rst (tcp0))
+	    {
+	      /* If the ACK is acceptable, signal the client that the peer
+	       * is not willing to accept the connection and drop it */
+	      if (tcp_ack (tcp0))
+		tcp_connection_reset (tc0);
+	      goto drop;
+	    }
+
+	  /*
+	   * 3. check the security and precedence (skipped)
+	   */
+
+	  /*
+	   * 4. check the SYN bit
+	   */
+
+	  /* No SYN flag. Drop. */
+	  if (!tcp_syn (tcp0))
+	    {
+	      clib_warning ("not synack");
+	      goto drop;
+	    }
+
+	  /* Parse options */
+	  if (tcp_options_parse (tcp0, &tc0->rcv_opts))
+	    {
+	      clib_warning ("options parse fail");
+	      goto drop;
+	    }
+
+	  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
+	   * current thread pool.
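+	   * The half-open entry is cloned into this thread's connection
+	   * pool so the rest of the connection's lifetime needs no
+	   * cross-thread synchronization; the original entry is either
+	   * freed right away or, when another thread owns it, flagged
+	   * TCP_CONN_HALF_OPEN_DONE below so the owner can reclaim it once
+	   * the SYN retransmit timer expires.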
*/ + pool_get (tm->connections[my_thread_index], new_tc0); + clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; + new_tc0->c_thread_index = my_thread_index; + new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; + new_tc0->irs = seq0; + new_tc0->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = + TCP_TIMER_HANDLE_INVALID; + + /* If this is not the owning thread, wait for syn retransmit to + * expire and cleanup then */ + if (tcp_half_open_connection_cleanup (tc0)) + tc0->flags |= TCP_CONN_HALF_OPEN_DONE; + + if (tcp_opts_tstamp (&new_tc0->rcv_opts)) + { + new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; + new_tc0->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&new_tc0->rcv_opts)) + new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; + + /* RFC1323: SYN and SYN-ACK wnd not scaled */ + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); + new_tc0->snd_wl1 = seq0; + new_tc0->snd_wl2 = ack0; + + tcp_connection_init_vars (new_tc0); + + /* SYN-ACK: See if we can switch to ESTABLISHED state */ + if (PREDICT_TRUE (tcp_ack (tcp0))) + { + /* Our SYN is ACKed: we have iss < ack = snd_una */ + + /* TODO Dequeue acknowledged segments if we support Fast Open */ + new_tc0->snd_una = ack0; + new_tc0->state = TCP_STATE_ESTABLISHED; + + /* Make sure las is initialized for the wnd computation */ + new_tc0->rcv_las = new_tc0->rcv_nxt; + + /* Notify app that we have connection. If session layer can't + * allocate session send reset */ + if (stream_session_connect_notify (&new_tc0->connection, 0)) + { + clib_warning ("connect notify fail"); + tcp_send_reset_w_pkt (new_tc0, b0, is_ip4); + tcp_connection_cleanup (new_tc0); + goto drop; + } + + /* Make sure after data segment processing ACK is sent */ + new_tc0->flags |= TCP_CONN_SNDACK; + + /* Update rtt with the syn-ack sample */ + tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); + } + /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ + else + { + new_tc0->state = TCP_STATE_SYN_RCVD; + + /* Notify app that we have connection */ + if (stream_session_connect_notify (&new_tc0->connection, 0)) + { + tcp_connection_cleanup (new_tc0); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0); + goto drop; + } + + tc0->rtt_ts = 0; + tcp_init_snd_vars (tc0); + tcp_make_synack (new_tc0, b0); + next0 = tcp_next_output (is_ip4); + + goto drop; + } + + /* Read data, if any */ + if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len)) + { + ASSERT (0); + error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0); + if (error0 == TCP_ERROR_PURE_ACK) + error0 = TCP_ERROR_SYN_ACKS_RCVD; + } + else + { + tcp_make_ack (new_tc0, b0); + next0 = tcp_next_output (new_tc0->c_is_ip4); + } + + drop: + + b0->error = error0 ? 
node->errors[error0] : 0; + if (PREDICT_FALSE + ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0)) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_syn_sent_node) = +{ + .function = tcp4_syn_sent, + .name = "tcp4-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_syn_sent_node) = +{ + .function = tcp6_syn_sent_rcv, + .name = "tcp6-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); + +/** + * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED + * as per RFC793 p. 
64 + */ +always_inline uword +tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index, errors = 0; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + u8 is_fin0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto drop; + } + + tcp0 = tcp_buffer_hdr (b0); + is_fin0 = tcp_is_fin (tcp0); + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len; + + if (CLIB_DEBUG) + { + tcp_connection_t *tmp; + tmp = tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (tmp->state != tc0->state) + { + clib_warning ("state changed"); + ASSERT (0); + goto drop; + } + } + + /* + * Special treatment for CLOSED + */ + switch (tc0->state) + { + case TCP_STATE_CLOSED: + goto drop; + break; + } + + /* + * For all other states (except LISTEN) + */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0, + &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + switch (tc0->state) + { + case TCP_STATE_SYN_RCVD: + /* + * If the segment acknowledgment is not acceptable, form a + * reset segment, + * <SEQ=SEG.ACK><CTL=RST> + * and send it. + */ + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + { + clib_warning ("connection not accepted"); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); + goto drop; + } + + /* Update rtt and rto */ + tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + + /* Switch state to ESTABLISHED */ + tc0->state = TCP_STATE_ESTABLISHED; + + /* Initialize session variables */ + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) + << tc0->rcv_opts.wscale; + tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + stream_session_accept_notify (&tc0->connection); + + /* Reset SYN-ACK retransmit and SYN_RCV establish timers */ + tcp_retransmit_timer_reset (tc0); + tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + break; + case TCP_STATE_ESTABLISHED: + /* We can get packets in established state here because they + * were enqueued before state change */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + break; + case TCP_STATE_FIN_WAIT_1: + /* In addition to the processing for the ESTABLISHED state, if + * our FIN is now acknowledged then enter FIN-WAIT-2 and + * continue processing in that state. 
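+	   * Here our FIN may not even be sent yet if the tx fifo has not
+	   * drained (TCP_CONN_FINPNDG below); only once snd_una ==
+	   * snd_una_max, i.e. the sequence number consumed by the FIN has
+	   * been acked, does the connection advance to FIN-WAIT-2.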
*/ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + /* Still have to send the FIN */ + if (tc0->flags & TCP_CONN_FINPNDG) + { + /* TX fifo finally drained */ + if (!stream_session_tx_fifo_max_dequeue (&tc0->connection)) + tcp_send_fin (tc0); + } + /* If FIN is ACKed */ + else if (tc0->snd_una == tc0->snd_una_max) + { + tc0->state = TCP_STATE_FIN_WAIT_2; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + + /* Stop all retransmit timers because we have nothing more + * to send. Enable waitclose though because we're willing to + * wait for peer's FIN but not indefinitely. */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + } + break; + case TCP_STATE_FIN_WAIT_2: + /* In addition to the processing for the ESTABLISHED state, if + * the retransmission queue is empty, the user's CLOSE can be + * acknowledged ("ok") but do not delete the TCB. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + break; + case TCP_STATE_CLOSE_WAIT: + /* Do the same processing as for the ESTABLISHED state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + break; + case TCP_STATE_CLOSING: + /* In addition to the processing for the ESTABLISHED state, if + * the ACK acknowledges our FIN then enter the TIME-WAIT state, + * otherwise ignore the segment. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + tc0->state = TCP_STATE_TIME_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + goto drop; + + break; + case TCP_STATE_LAST_ACK: + /* The only thing that [should] arrive in this state is an + * acknowledgment of our FIN. If our FIN is now acknowledged, + * delete the TCB, enter the CLOSED state, and return. */ + + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + goto drop; + + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + /* Apparently our FIN was lost */ + if (is_fin0) + { + tcp_send_fin (tc0); + goto drop; + } + + tc0->state = TCP_STATE_CLOSED; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + + /* Don't delete the connection/session yet. Instead, wait a + * reasonable amount of time until the pipes are cleared. In + * particular, this makes sure that we won't have dead sessions + * when processing events on the tx path */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + tcp_retransmit_timer_reset (tc0); + + goto drop; + + break; + case TCP_STATE_TIME_WAIT: + /* The only thing that can arrive in this state is a + * retransmission of the remote FIN. Acknowledge it, and restart + * the 2 MSL timeout. */ + + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + tcp_make_ack (tc0, b0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + goto drop; + + break; + default: + ASSERT (0); + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_FIN_WAIT_2: + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); + else if (is_fin0) + tc0->rcv_nxt += 1; + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + case TCP_STATE_TIME_WAIT: + /* This should not occur, since a FIN has been received from the + * remote side. Ignore the segment text. 
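+	       * (The peer has already sent a FIN in all four of these
+	       * states, so any segment text seen here is presumably a
+	       * retransmission and can be dropped without loss.)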
*/ + break; + } + + /* 8: check the FIN bit */ + if (!is_fin0) + goto drop; + + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_SYN_RCVD: + /* Send FIN-ACK notify app and enter CLOSE-WAIT */ + tcp_connection_timers_reset (tc0); + tcp_make_fin (tc0, b0); + tc0->snd_nxt += 1; + next0 = tcp_next_output (tc0->c_is_ip4); + stream_session_disconnect_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSE_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + /* move along .. */ + break; + case TCP_STATE_FIN_WAIT_1: + tc0->state = TCP_STATE_CLOSING; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + /* Wait for ACK but not forever */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + break; + case TCP_STATE_FIN_WAIT_2: + /* Got FIN, send ACK! Be more aggressive with resource cleanup */ + tc0->state = TCP_STATE_TIME_WAIT; + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + break; + case TCP_STATE_TIME_WAIT: + /* Remain in the TIME-WAIT state. Restart the time-wait + * timeout. + */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); + break; + } + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + + drop: + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_rcv_process_node) = +{ + .function = tcp4_rcv_process, + .name = "tcp4-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_rcv_process_node) = +{ + .function = tcp6_rcv_process, + .name = "tcp6-rcv-process", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process); + +vlib_node_registration_t tcp4_listen_node; +vlib_node_registration_t tcp6_listen_node; + +/** + * LISTEN state processing as per RFC 793 p. 65 + */ +always_inline uword +tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index; + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_rx_trace_t *t0; + tcp_header_t *th0 = 0; + tcp_connection_t *lc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + tcp_connection_t *child0; + u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + } + + /* Create child session. For syn-flood protection use filter */ + + /* 1. first check for an RST: handled in dispatch */ + /* if (tcp_rst (th0)) + goto drop; */ + + /* 2. second check for an ACK: handled in dispatch */ + /* if (tcp_ack (th0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } */ + + /* 3. check for a SYN (did that already) */ + + /* Make sure connection wasn't just created */ + child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN)) + { + error0 = TCP_ERROR_CREATE_EXISTS; + goto drop; + } + + /* Create child session and send SYN-ACK */ + child0 = tcp_connection_new (my_thread_index); + child0->c_lcl_port = lc0->c_lcl_port; + child0->c_rmt_port = th0->src_port; + child0->c_is_ip4 = is_ip4; + child0->state = TCP_STATE_SYN_RCVD; + + if (is_ip4) + { + child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32; + child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32; + } + else + { + clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); + } + + if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, + 0 /* notify */ )) + { + clib_warning ("session accept fail"); + tcp_connection_cleanup (child0); + error0 = TCP_ERROR_CREATE_SESSION_FAIL; + goto drop; + } + + if (tcp_options_parse (th0, &child0->rcv_opts)) + { + clib_warning ("options parse fail"); + goto drop; + } + + child0->irs = vnet_buffer (b0)->tcp.seq_number; + child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->rcv_las = child0->rcv_nxt; + + /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} + * segments are used to initialize PAWS. 
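+	   * In effect, tsval_recent is seeded from the SYN so that a later
+	   * segment carrying a timestamp older than tsval_recent (modulo
+	   * 32-bit wraparound) can be rejected as stale by the PAWS check
+	   * during segment validation.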
*/ + if (tcp_opts_tstamp (&child0->rcv_opts)) + { + child0->tsval_recent = child0->rcv_opts.tsval; + child0->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&child0->rcv_opts)) + child0->snd_wscale = child0->rcv_opts.wscale; + + child0->snd_wnd = clib_net_to_host_u16 (th0->window) + << child0->snd_wscale; + child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + tcp_connection_init_vars (child0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1); + + /* Reuse buffer to make syn-ack and send */ + tcp_make_synack (child0, b0); + next0 = tcp_next_output (is_ip4); + tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME); + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, lc0, + sizeof (t0->tcp_connection)); + } + + b0->error = node->errors[error0]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_listen_node) = +{ + .function = tcp4_listen, + .name = "tcp4-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_listen_node) = +{ + .function = tcp6_listen, + .name = "tcp6-listen", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, + .format_trace = format_tcp_rx_trace_short, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen); + +vlib_node_registration_t tcp4_input_node; +vlib_node_registration_t tcp6_input_node; + +typedef enum _tcp_input_next +{ + TCP_INPUT_NEXT_DROP, + TCP_INPUT_NEXT_LISTEN, + TCP_INPUT_NEXT_RCV_PROCESS, + TCP_INPUT_NEXT_SYN_SENT, + TCP_INPUT_NEXT_ESTABLISHED, + TCP_INPUT_NEXT_RESET, + TCP_INPUT_NEXT_PUNT, + TCP_INPUT_N_NEXT +} tcp_input_next_t; + +#define foreach_tcp4_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp4-listen") \ + _ (RCV_PROCESS, "tcp4-rcv-process") \ + _ (SYN_SENT, "tcp4-syn-sent") \ + _ (ESTABLISHED, "tcp4-established") \ + _ (RESET, "tcp4-reset") \ + _ (PUNT, "error-punt") + +#define foreach_tcp6_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp6-listen") \ + _ (RCV_PROCESS, "tcp6-rcv-process") \ + _ (SYN_SENT, "tcp6-syn-sent") \ + _ (ESTABLISHED, "tcp6-established") \ + _ (RESET, "tcp6-reset") \ + _ (PUNT, "error-punt") + +#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) + +always_inline uword +tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index; + tcp_main_t *tm = vnet_get_tcp_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + next_index = node->cached_next_index; + tcp_set_time_now (my_thread_index); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + int n_advance_bytes0, n_data_bytes0; + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + transport_connection_t *tconn; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; + u8 flags0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + vnet_buffer (b0)->tcp.flags = 0; + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + tconn = stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + tconn = stream_session_lookup_transport_wt6 (&ip60->dst_address, + &ip60->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); + } + + /* Length check */ + if (PREDICT_FALSE (n_advance_bytes0 < 0)) 
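+	    /* A negative value means the length fields parsed above are
+	     * inconsistent with the headers, i.e. the segment is
+	     * malformed */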
+ { + error0 = TCP_ERROR_LENGTH; + goto done; + } + + /* Session exists */ + if (PREDICT_TRUE (0 != tc0)) + { + /* Save connection index */ + vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index; + vnet_buffer (b0)->tcp.seq_number = + clib_net_to_host_u32 (tcp0->seq_number); + vnet_buffer (b0)->tcp.ack_number = + clib_net_to_host_u32 (tcp0->ack_number); + + vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0 + - (u8 *) vlib_buffer_get_current (b0); + vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0; + vnet_buffer (b0)->tcp.data_len = n_data_bytes0; + + flags0 = tcp0->flags & filter_flags; + next0 = tm->dispatch_table[tc0->state][flags0].next; + error0 = tm->dispatch_table[tc0->state][flags0].error; + + if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH + || next0 == TCP_INPUT_NEXT_RESET)) + { + /* Overload tcp flags to store state */ + tcp_state_t state0 = tc0->state; + vnet_buffer (b0)->tcp.flags = tc0->state; + + if (error0 == TCP_ERROR_DISPATCH) + clib_warning ("disp error state %U flags %U", + format_tcp_state, state0, format_tcp_flags, + (int) flags0); + } + } + else + { + if ((is_ip4 && tm->punt_unknown4) || + (!is_ip4 && tm->punt_unknown6)) + { + next0 = TCP_INPUT_NEXT_PUNT; + error0 = TCP_ERROR_PUNT; + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + } + } + + done: + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_input_node) = +{ + .function = tcp4_input, + .name = "tcp4-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp4_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_input_node) = +{ + .function = tcp6_input, + .name = "tcp6-input", + /* Takes a vector of packets. 
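+   * tcp46_input_inline above is the demux stage: it parses the IP and
+   * TCP headers, looks up the owning connection and uses the
+   * state-by-flags dispatch table built below to pick the next node
+   * (listen, syn-sent, established, rcv-process, reset or drop).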
*/
+  .vector_size = sizeof (u32),
+  .n_errors = TCP_N_ERROR,
+  .error_strings = tcp_error_strings,
+  .n_next_nodes = TCP_INPUT_N_NEXT,
+  .next_nodes =
+  {
+#define _(s,n) [TCP_INPUT_NEXT_##s] = n,
+    foreach_tcp6_input_next
+#undef _
+  },
+  .format_buffer = format_tcp_header,
+  .format_trace = format_tcp_rx_trace,
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input);
+
+static void
+tcp_dispatch_table_init (tcp_main_t * tm)
+{
+  int i, j;
+  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
+    for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
+      {
+	tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
+	tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
+      }
+
+#define _(t,f,n,e)					\
+do {							\
+    tm->dispatch_table[TCP_STATE_##t][f].next = (n);	\
+    tm->dispatch_table[TCP_STATE_##t][f].error = (e);	\
+} while (0)
+
+  /* SYNs for new connections -> tcp-listen. */
+  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
+  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
+  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
+  _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+    TCP_ERROR_NONE);
+  /* ACK for a SYN-ACK -> tcp-rcv-process. */
+  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  /* SYN-ACK for a SYN */
+  _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
+    TCP_ERROR_NONE);
+  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
+  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
+  _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
+    TCP_ERROR_NONE);
+  /* ACK for an established connection -> tcp-established. */
+  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+  /* FIN for an established connection -> tcp-established.
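+   * For example, after initialization
+   * tm->dispatch_table[TCP_STATE_ESTABLISHED][TCP_FLAG_FIN].next is
+   * TCP_INPUT_NEXT_ESTABLISHED, while any state/flags pair not listed
+   * here keeps the TCP_INPUT_NEXT_DROP / TCP_ERROR_DISPATCH default.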
*/ + _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + /* ACK or FIN-ACK to our FIN */ + _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + /* FIN in reply to our FIN from the other side */ + _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* FIN confirming that the peer (app) has closed */ + _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, + TCP_ERROR_CONNECTION_CLOSED); +#undef _ +} + +clib_error_t * +tcp_input_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + if ((error = vlib_call_init_function (vm, tcp_init))) + return error; + + /* Initialize dispatch table. */ + tcp_dispatch_table_init (tm); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c new file mode 100644 index 00000000..103fea4c --- /dev/null +++ b/src/vnet/tcp/tcp_newreno.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/tcp/tcp.h> + +void +newreno_congestion (tcp_connection_t * tc) +{ + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); +} + +void +newreno_recovered (tcp_connection_t * tc) +{ + tc->cwnd = tc->ssthresh; +} + +void +newreno_rcv_ack (tcp_connection_t * tc) +{ + if (tcp_in_slowstart (tc)) + { + tc->cwnd += clib_min (tc->snd_mss, tc->bytes_acked); + } + else + { + /* Round up to 1 if needed */ + tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1); + } +} + +void +newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) +{ + if (ack_type == TCP_CC_DUPACK) + { + if (!tcp_opts_sack_permitted (tc)) + tc->cwnd += tc->snd_mss; + } + else if (ack_type == TCP_CC_PARTIALACK) + { + /* RFC 6582 Sec. 3.2 */ + if (!tcp_opts_sack_permitted (&tc->rcv_opts)) + { + /* Deflate the congestion window by the amount of new data + * acknowledged by the Cumulative Acknowledgment field. + * If the partial ACK acknowledges at least one SMSS of new data, + * then add back SMSS bytes to the congestion window. This + * artificially inflates the congestion window in order to reflect + * the additional segment that has left the network. This "partial + * window deflation" attempts to ensure that, when fast recovery + * eventually ends, approximately ssthresh amount of data will be + * outstanding in the network.*/ + tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ? + tc->cwnd - tc->bytes_acked : tc->snd_mss; + if (tc->bytes_acked > tc->snd_mss) + tc->cwnd += tc->snd_mss; + } + } +} + +void +newreno_conn_init (tcp_connection_t * tc) +{ + tc->ssthresh = tc->snd_wnd; + tc->cwnd = tcp_initial_cwnd (tc); +} + +const static tcp_cc_algorithm_t tcp_newreno = { + .congestion = newreno_congestion, + .recovered = newreno_recovered, + .rcv_ack = newreno_rcv_ack, + .rcv_cong_ack = newreno_rcv_cong_ack, + .init = newreno_conn_init +}; + +clib_error_t * +newreno_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + tcp_cc_algo_register (TCP_CC_NEWRENO, &tcp_newreno); + + return error; +} + +VLIB_INIT_FUNCTION (newreno_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c new file mode 100644 index 00000000..a954bfa7 --- /dev/null +++ b/src/vnet/tcp/tcp_output.c @@ -0,0 +1,2113 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/tcp/tcp.h> +#include <vnet/lisp-cp/packets.h> +#include <math.h> + +vlib_node_registration_t tcp4_output_node; +vlib_node_registration_t tcp6_output_node; + +typedef enum _tcp_output_next +{ + TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, + TCP_OUTPUT_N_NEXT +} tcp_output_next_t; + +#define foreach_tcp4_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include <vnet/tcp/tcp_error.def> +#undef tcp_error +}; + +typedef struct +{ + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; +} tcp_tx_trace_t; + +u16 dummy_mtu = 1460; + +u8 * +format_tcp_tx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *); + uword indent = format_get_indent (s); + + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection, &t->tcp_connection, 1); + + return s; +} + +static u8 +tcp_window_compute_scale (u32 window) +{ + u8 wnd_scale = 0; + while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX) + wnd_scale++; + return wnd_scale; +} + +/** + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu - sizeof (tcp_header_t); +} + +/** + * TCP's initial window + */ +always_inline u32 +tcp_initial_wnd_unscaled (tcp_connection_t * tc) +{ + /* RFC 6928 recommends the value lower. However at the time our connections + * are initialized, fifos may not be allocated. Therefore, advertise the + * smallest possible unscaled window size and update once fifos are + * assigned to the session. + */ + /* + tcp_update_rcv_mss (tc); + TCP_IW_N_SEGMENTS * tc->mss; + */ + return TCP_MIN_RX_FIFO_SIZE; +} + +/** + * Compute initial window and scale factor. As per RFC1323, window field in + * SYN and SYN-ACK segments is never scaled. + */ +u32 +tcp_initial_window_to_advertise (tcp_connection_t * tc) +{ + u32 max_fifo; + + /* Initial wnd for SYN. Fifos are not allocated yet. + * Use some predefined value. 
For SYN-ACK we still want the + * scale to be computed in the same way */ + max_fifo = TCP_MAX_RX_FIFO_SIZE; + + tc->rcv_wscale = tcp_window_compute_scale (max_fifo); + tc->rcv_wnd = tcp_initial_wnd_unscaled (tc); + + return clib_min (tc->rcv_wnd, TCP_WND_MAX); +} + +/** + * Compute and return window to advertise, scaled as per RFC1323 + */ +u32 +tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) +{ + if (state < TCP_STATE_ESTABLISHED) + return tcp_initial_window_to_advertise (tc); + + tcp_update_rcv_wnd (tc); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } + else + { + tc->flags &= ~TCP_CONN_SENT_RCV_WND0; + } + + return tc->rcv_wnd >> tc->rcv_wscale; +} + +void +tcp_update_rcv_wnd (tcp_connection_t * tc) +{ + i32 observed_wnd; + u32 available_space, max_fifo, wnd; + + /* + * Figure out how much space we have available + */ + available_space = stream_session_max_rx_enqueue (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); + + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) + available_space = 0; + + /* + * Use the above and what we know about what we've previously advertised + * to compute the new window + */ + observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + if (observed_wnd < 0) + observed_wnd = 0; + + /* Bad. Thou shalt not shrink */ + if (available_space < observed_wnd) + { + wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); + } + else + { + wnd = available_space; + } + + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) + { + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; + } + + tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); +} + +/** + * Write TCP options to segment. 
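+ *
+ * Each option is written as kind, length, then payload in network
+ * byte order; e.g. an MSS of 1460 is emitted as the four bytes
+ * 0x02 0x04 0x05 0xb4. When the running length is not a multiple of
+ * 4, an EOL terminator is written and the remainder is padded with
+ * NOOPs so the TCP header stays 32-bit aligned.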
+ */ +u32 +tcp_options_write (u8 * data, tcp_options_t * opts) +{ + u32 opts_len = 0; + u32 buf, seq_len = 4; + + if (tcp_opts_mss (opts)) + { + *data++ = TCP_OPTION_MSS; + *data++ = TCP_OPTION_LEN_MSS; + buf = clib_host_to_net_u16 (opts->mss); + clib_memcpy (data, &buf, sizeof (opts->mss)); + data += sizeof (opts->mss); + opts_len += TCP_OPTION_LEN_MSS; + } + + if (tcp_opts_wscale (opts)) + { + *data++ = TCP_OPTION_WINDOW_SCALE; + *data++ = TCP_OPTION_LEN_WINDOW_SCALE; + *data++ = opts->wscale; + opts_len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_sack_permitted (opts)) + { + *data++ = TCP_OPTION_SACK_PERMITTED; + *data++ = TCP_OPTION_LEN_SACK_PERMITTED; + opts_len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + if (tcp_opts_tstamp (opts)) + { + *data++ = TCP_OPTION_TIMESTAMP; + *data++ = TCP_OPTION_LEN_TIMESTAMP; + buf = clib_host_to_net_u32 (opts->tsval); + clib_memcpy (data, &buf, sizeof (opts->tsval)); + data += sizeof (opts->tsval); + buf = clib_host_to_net_u32 (opts->tsecr); + clib_memcpy (data, &buf, sizeof (opts->tsecr)); + data += sizeof (opts->tsecr); + opts_len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack (opts)) + { + int i; + u32 n_sack_blocks = clib_min (vec_len (opts->sacks), + TCP_OPTS_MAX_SACK_BLOCKS); + + if (n_sack_blocks != 0) + { + *data++ = TCP_OPTION_SACK_BLOCK; + *data++ = 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + for (i = 0; i < n_sack_blocks; i++) + { + buf = clib_host_to_net_u32 (opts->sacks[i].start); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + buf = clib_host_to_net_u32 (opts->sacks[i].end); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + } + opts_len += 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + } + } + + /* Terminate TCP options */ + if (opts_len % 4) + { + *data++ = TCP_OPTION_EOL; + opts_len += TCP_OPTION_LEN_EOL; + } + + /* Pad with zeroes to a u32 boundary */ + while (opts_len % 4) + { + *data++ = TCP_OPTION_NOOP; + opts_len += TCP_OPTION_LEN_NOOP; + } + return opts_len; +} + +always_inline int +tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = dummy_mtu; /*XXX discover that */ + len += TCP_OPTION_LEN_MSS; + + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = wnd_scale; + len += TCP_OPTION_LEN_WINDOW_SCALE; + + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = 0; + len += TCP_OPTION_LEN_TIMESTAMP; + + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = tc->mss; + len += TCP_OPTION_LEN_MSS; + + if (tcp_opts_wscale (&tc->rcv_opts)) + { + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = tc->rcv_wscale; + len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_tstamp (&tc->rcv_opts)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_established_options 
(tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags = 0; + + if (tcp_opts_tstamp (&tc->rcv_opts)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + { + if (vec_len (tc->snd_sacks)) + { + opts->flags |= TCP_OPTS_FLAG_SACK; + opts->sacks = tc->snd_sacks; + opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks), + TCP_OPTS_MAX_SACK_BLOCKS); + len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; + } + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, + tcp_state_t state) +{ + switch (state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + return tcp_make_established_options (tc, opts); + case TCP_STATE_SYN_RCVD: + return tcp_make_synack_options (tc, opts); + case TCP_STATE_SYN_SENT: + return tcp_make_syn_options (opts, tc->rcv_wscale); + default: + clib_warning ("Not handled!"); + return 0; + } +} + +/** + * Update snd_mss to reflect the effective segment size that we can send + * by taking into account all TCP options, including SACKs + */ +void +tcp_update_snd_mss (tcp_connection_t * tc) +{ + /* Compute options to be used for connection. These may be reused when + * sending data or to compute the effective mss (snd_mss) */ + tc->snd_opts_len = + tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); + + /* XXX check if MTU has been updated */ + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; + ASSERT (tc->snd_mss > 0); +} + +void +tcp_init_mss (tcp_connection_t * tc) +{ + u16 default_min_mss = 536; + tcp_update_rcv_mss (tc); + + /* TODO cache mss and consider PMTU discovery */ + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); + + if (tc->snd_mss < 45) + { + clib_warning ("snd mss is 0"); + /* Assume that at least the min default mss works */ + tc->snd_mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; + } + + /* We should have enough space for 40 bytes of options */ + ASSERT (tc->snd_mss > 45); + + /* If we use timestamp option, account for it */ + if (tcp_opts_tstamp (&tc->rcv_opts)) + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; +} + +always_inline int +tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) +{ + vlib_main_t *vm = vlib_get_main (); + u32 current_length = vec_len (tm->tx_buffers[thread_index]); + u32 n_allocated; + + vec_validate (tm->tx_buffers[thread_index], + current_length + n_free_buffers - 1); + n_allocated = + vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length], + n_free_buffers); + _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated; + /* buffer shortage, report failure */ + if (vec_len (tm->tx_buffers[thread_index]) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + return 0; +} + +always_inline int +tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) +{ + u32 *my_tx_buffers; + u32 thread_index = vlib_get_thread_index (); + if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0)) + { + if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE)) + return -1; + } + my_tx_buffers = tm->tx_buffers[thread_index]; + *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1]; + _vec_len (my_tx_buffers) -= 1; + return 0; +} + +always_inline void +tcp_return_buffer (tcp_main_t * tm) +{ + _vec_len 
(tm->tx_buffers[vlib_get_thread_index ()]) += 1; +} + +always_inline void * +tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, b->next_buffer); + /* Zero all flags but free list index and trace flag */ + b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1; + b->current_data = 0; + b->current_length = 0; + b->total_length_not_including_first_buffer = 0; + vnet_buffer (b)->tcp.flags = 0; + + /* Leave enough space for headers */ + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +always_inline void * +tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->total_length_not_including_first_buffer = 0; + vnet_buffer (b)->tcp.flags = 0; + + /* Leave enough space for headers */ + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +/** + * Prepare ACK + */ +void +tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, + u8 flags) +{ + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 wnd; + + wnd = tcp_window_to_advertise (tc, state); + + /* Make and write options */ + tcp_opts_len = tcp_make_established_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); + + tcp_options_write ((u8 *) (th + 1), snd_opts); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; +} + +/** + * Convert buffer to ACK + */ +void +tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + vlib_main_t *vm = vlib_get_main (); + + tcp_reuse_buffer (vm, b); + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); + TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; +} + +/** + * Convert buffer to FIN-ACK + */ +void +tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) +{ + vlib_main_t *vm = vlib_get_main (); + u8 flags = 0; + + tcp_reuse_buffer (vm, b); + + flags = TCP_FLAG_FIN | TCP_FLAG_ACK; + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, flags); + + /* Reset flags, make sure ack is sent */ + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +/** + * Convert buffer to SYN + */ +void +tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b) +{ + u8 tcp_hdr_opts_len, tcp_opts_len; + tcp_header_t *th; + u16 initial_wnd; + tcp_options_t snd_opts; + + initial_wnd = tcp_initial_window_to_advertise (tc); + + /* Make and write options */ + memset (&snd_opts, 0, sizeof (snd_opts)); + tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, + initial_wnd); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + tcp_options_write ((u8 *) (th + 1), &snd_opts); + + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, + tc->rto * TCP_TO_TIMER_TICK); +} + +/** + * Convert buffer to SYN-ACK + */ +void +tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 initial_wnd; + + memset (snd_opts, 0, sizeof (*snd_opts)); + tcp_reuse_buffer (vm, b); + + 
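+  /* As with the SYN, the window in a SYN-ACK is advertised unscaled
+   * (RFC 1323); window scaling only applies once the connection is
+   * established. */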
initial_wnd = tcp_initial_window_to_advertise (tc); + tcp_opts_len = tcp_make_synack_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, + TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd); + tcp_options_write ((u8 *) (th + 1), snd_opts); + + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + + /* Init retransmit timer. Use update instead of set because of + * retransmissions */ + tcp_retransmit_timer_force_update (tc); + TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); +} + +always_inline void +tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->error = 0; + + /* Default FIB for now */ + vnet_buffer (b)->sw_if_index[VLIB_TX] = 0; + + /* Send to IP lookup */ + next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 2; + b->pre_data[1] = next_index; + } + + f = tm->ip_lookup_tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f; + } + + to_next = vlib_frame_vector_args (f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1); +} + +always_inline void +tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->error = 0; + + /* Decide where to send the packet */ + next_index = is_ip4 ? 
tcp4_output_node.index : tcp6_output_node.index; + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } + to_next = vlib_frame_vector_args (f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + +int +tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, + tcp_state_t state, u8 thread_index, u8 is_ip4) +{ + ip4_header_t *ih4; + ip6_header_t *ih6; + tcp_header_t *th0; + ip4_address_t src_ip40, dst_ip40; + ip6_address_t src_ip60, dst_ip60; + u16 src_port, dst_port; + u32 tmp; + u32 seq, ack; + u8 flags; + + /* Find IP and TCP headers */ + th0 = tcp_buffer_hdr (b0); + + /* Save src and dst ip */ + if (is_ip4) + { + ih4 = vlib_buffer_get_current (b0); + ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); + src_ip40.as_u32 = ih4->src_address.as_u32; + dst_ip40.as_u32 = ih4->dst_address.as_u32; + } + else + { + ih6 = vlib_buffer_get_current (b0); + ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); + clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); + clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); + } + + src_port = th0->src_port; + dst_port = th0->dst_port; + + /* Try to determine what/why we're actually resetting */ + if (state == TCP_STATE_CLOSED) + { + if (!tcp_syn (th0)) + return -1; + + tmp = clib_net_to_host_u32 (th0->seq_number); + + /* Got a SYN for no listener. 
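+       * Per RFC 793 the reset for an incoming segment that carries no
+       * ACK is <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>; the + 1 below
+       * accounts for the SYN consuming one sequence number.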
*/ + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + ack = clib_host_to_net_u32 (tmp + 1); + seq = 0; + } + else + { + flags = TCP_FLAG_RST; + seq = th0->ack_number; + ack = 0; + } + + tcp_reuse_buffer (vm, b0); + th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack, + sizeof (tcp_header_t), flags, 0); + + if (is_ip4) + { + ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40, + IP_PROTOCOL_TCP, 1); + th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); + } + else + { + int bogus = ~0; + ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60, + IP_PROTOCOL_TCP); + th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); + ASSERT (!bogus); + } + + return 0; +} + +/** + * Send reset without reusing existing buffer + * + * It extracts connection info out of original packet + */ +void +tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u8 tcp_hdr_len, flags = 0; + tcp_header_t *th, *pkt_th; + u32 seq, ack; + ip4_header_t *ih4, *pkt_ih4; + ip6_header_t *ih6, *pkt_ih6; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + + /* Make and write options */ + tcp_hdr_len = sizeof (tcp_header_t); + + if (is_ip4) + { + pkt_ih4 = vlib_buffer_get_current (pkt); + pkt_th = ip4_next_header (pkt_ih4); + } + else + { + pkt_ih6 = vlib_buffer_get_current (pkt); + pkt_th = ip6_next_header (pkt_ih6); + } + + if (tcp_ack (pkt_th)) + { + flags = TCP_FLAG_RST; + seq = pkt_th->ack_number; + ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; + } + else + { + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + seq = 0; + ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end); + } + + th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port, + seq, ack, tcp_hdr_len, flags, 0); + + /* Swap src and dst ip */ + if (is_ip4) + { + ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40); + ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address, + &pkt_ih4->src_address, IP_PROTOCOL_TCP, 1); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); + } + else + { + int bogus = ~0; + ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) == + 0x60); + ih6 = vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address, + &pkt_ih6->src_address, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); + ASSERT (!bogus); + } + + tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); +} + +/** + * Build and set reset packet for connection + */ +void +tcp_send_reset (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_buffer_t *b; + u32 bi; + tcp_header_t *th; + u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len; + u8 flags; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); + advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED); + flags = TCP_FLAG_RST; + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); + ASSERT (opts_write_len == 
tc->snd_opts_len); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + if (tc->c_is_ip4) + { + ip4_header_t *ih4; + ih4 = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip.ip4, + &tc->c_rmt_ip.ip4, IP_PROTOCOL_TCP, 0); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); + } + else + { + int bogus = ~0; + ip6_header_t *ih6; + ih6 = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip.ip6, + &tc->c_rmt_ip.ip6, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); + ASSERT (!bogus); + } + tcp_enqueue_to_ip_lookup_now (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); +} + +void +tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_header_t *th = vlib_buffer_get_current (b); + vlib_main_t *vm = vlib_get_main (); + if (tc->c_is_ip4) + { + ip4_header_t *ih; + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, + &tc->c_rmt_ip4, IP_PROTOCOL_TCP, 1); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); + } + else + { + ip6_header_t *ih; + int bogus = ~0; + + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, + &tc->c_rmt_ip6, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); + ASSERT (!bogus); + } +} + +/** + * Send SYN + * + * Builds a SYN packet for a half-open connection and sends it to ipx_lookup. + * The packet is not forwarded through tcpx_output to avoid doing lookups + * in the half_open pool. + */ +void +tcp_send_syn (tcp_connection_t * tc) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + tcp_make_syn (tc, b); + + /* Measure RTT with this */ + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + tc->rto_boff = 0; + + /* Set the connection establishment timer */ + tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME); + + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); +} + +/** + * Flush tx frame populated by retransmits and timer pops + */ +void +tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.tx_frames[!is_ip4][thread_index]); + tcp_main.tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush ip lookup tx frames populated by timer pops + */ +always_inline void +tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? 
ip4_lookup_node.index : ip6_lookup_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.ip_lookup_tx_frames[!is_ip4] + [thread_index]); + tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush v4 and v6 tcp and ip-lookup tx frames for thread index + */ +void +tcp_flush_frames_to_output (u8 thread_index) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_flush_frame_to_output (vm, thread_index, 1); + tcp_flush_frame_to_output (vm, thread_index, 0); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 1); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 0); +} + +/** + * Send FIN + */ +void +tcp_send_fin (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; + u32 bi; + u8 fin_snt = 0; + + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + fin_snt = tc->flags & TCP_CONN_FINSNT; + if (fin_snt) + tc->snd_nxt = tc->snd_una; + tcp_make_fin (tc, b); + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); + if (!fin_snt) + { + tc->flags |= TCP_CONN_FINSNT; + tc->flags &= ~TCP_CONN_FINPNDG; + /* Account for the FIN */ + tc->snd_una_max += 1; + tc->snd_nxt = tc->snd_una_max; + } + tcp_retransmit_timer_force_update (tc); + TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); +} + +always_inline u8 +tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) +{ + switch (next_state) + { + case TCP_STATE_ESTABLISHED: + return TCP_FLAG_ACK; + case TCP_STATE_SYN_RCVD: + return TCP_FLAG_SYN | TCP_FLAG_ACK; + case TCP_STATE_SYN_SENT: + return TCP_FLAG_SYN; + case TCP_STATE_LAST_ACK: + case TCP_STATE_FIN_WAIT_1: + if (tc->snd_nxt + 1 < tc->snd_una_max) + return TCP_FLAG_ACK; + else + return TCP_FLAG_FIN; + default: + clib_warning ("Shouldn't be here!"); + } + return 0; +} + +/** + * Push TCP header and update connection variables + */ +static void +tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_state_t next_state, u8 compute_opts) +{ + u32 advertise_wnd, data_len; + u8 tcp_hdr_opts_len, opts_write_len, flags; + tcp_header_t *th; + + data_len = b->current_length + b->total_length_not_including_first_buffer; + ASSERT (!b->total_length_not_including_first_buffer + || (b->flags & VLIB_BUFFER_NEXT_PRESENT)); + vnet_buffer (b)->tcp.flags = 0; + + if (compute_opts) + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); + advertise_wnd = tcp_window_to_advertise (tc, next_state); + flags = tcp_make_state_flags (tc, next_state); + + /* Push header and options */ + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); + + ASSERT (opts_write_len == tc->snd_opts_len); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + + /* + * Update connection variables + */ + + tc->snd_nxt += data_len; + tc->rcv_las = tc->rcv_nxt; + + /* TODO this is updated in output as well ... 
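(tcp46_output_inline also advances snd_una_max when snd_nxt has moved past it.)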
*/ + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) + { + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + } + + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); +} + +void +tcp_send_ack (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + + vlib_buffer_t *b; + u32 bi; + + /* Get buffer */ + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + + /* Fill in the ACK */ + tcp_make_ack (tc, b); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} + +/** + * Delayed ack timer handler + * + * Sends delayed ACK when timer expires + */ +void +tcp_timer_delack_handler (u32 index) +{ + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + + tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; + tcp_send_ack (tc); +} + +/** + * Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + */ +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + int n_bytes = 0; + u32 start, bi, available_bytes, seg_size; + u8 *data; + + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); + ASSERT (max_deq_bytes != 0); + + /* + * Make sure we can retransmit something + */ + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); + available_bytes -= offset; + if (!available_bytes) + return 0; + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); + + /* Start is beyond snd_congestion */ + start = tc->snd_una + offset; + if (seq_geq (start, tc->snd_congestion)) + goto done; + + /* Don't overshoot snd_congestion */ + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) + { + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) + goto done; + } + + seg_size = max_deq_bytes + MAX_HDRS_LEN; + + /* + * Prepare options + */ + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + + /* + * Allocate and fill in buffer(s) + */ + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return 0; + *b = vlib_get_buffer (vm, bi); + data = tcp_init_buffer (vm, *b); + + /* Easy case, buffer size greater than mss */ + if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer)) + { + n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, + max_deq_bytes); + ASSERT (n_bytes == max_deq_bytes); + b[0]->current_length = n_bytes; + tcp_push_hdr_i (tc, *b, tc->state, 0); + } + /* Split mss into multiple buffers */ + else + { + u32 chain_bi = ~0, n_bufs_per_seg; + u32 thread_index = vlib_get_thread_index (); + u16 n_peeked, len_to_deq, available_bufs; + vlib_buffer_t *chain_b, *prev_b; + int i; + + n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer); + + /* Make sure we have enough buffers */ + available_bufs = vec_len (tm->tx_buffers[thread_index]); + if (n_bufs_per_seg > available_bufs) + { + if (tcp_alloc_tx_buffers (tm, thread_index, + VLIB_FRAME_SIZE - available_bufs)) + { + tcp_return_buffer (tm); + *b = 0; + return 0; + } + } + + n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, + tm->bytes_per_buffer - + MAX_HDRS_LEN); + b[0]->current_length = n_bytes; + b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b[0]->total_length_not_including_first_buffer = 0; + 
max_deq_bytes -= n_bytes; + + chain_b = *b; + for (i = 1; i < n_bufs_per_seg; i++) + { + prev_b = chain_b; + len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer); + tcp_get_free_buffer_index (tm, &chain_bi); + ASSERT (chain_bi != (u32) ~ 0); + chain_b = vlib_get_buffer (vm, chain_bi); + chain_b->current_data = 0; + data = vlib_buffer_get_current (chain_b); + n_peeked = stream_session_peek_bytes (&tc->connection, data, + offset + n_bytes, len_to_deq); + ASSERT (n_peeked == len_to_deq); + n_bytes += n_peeked; + chain_b->current_length = n_peeked; + chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; + chain_b->next_buffer = 0; + + /* update previous buffer */ + prev_b->next_buffer = chain_bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + max_deq_bytes -= n_peeked; + b[0]->total_length_not_including_first_buffer += n_peeked; + } + + tcp_push_hdr_i (tc, *b, tc->state, 0); + } + + ASSERT (n_bytes > 0); + ASSERT (((*b)->current_data + (*b)->current_length) <= + tm->bytes_per_buffer); + + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; + +done: + TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); + return n_bytes; +} + +/** + * Reset congestion control, switch cwnd to loss window and try again. + */ +static void +tcp_rtx_timeout_cc (tcp_connection_t * tc) +{ + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + + /* Cleanly recover cc (also clears up fast retransmit) */ + if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + /* Start again from the beginning */ + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); + tc->cwnd = tcp_loss_wnd (tc); + tc->snd_congestion = tc->snd_una_max; + tc->rtt_ts = 0; + tcp_recovery_on (tc); +} + +static void +tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + vlib_buffer_t *b = 0; + u32 bi, n_bytes; + + if (is_syn) + { + tc = tcp_half_open_connection_get (index); + /* Note: the connection may have transitioned to ESTABLISHED... */ + if (PREDICT_FALSE (tc == 0)) + return; + tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID; + } + else + { + tc = tcp_connection_get (index, thread_index); + /* Note: the connection may have been closed and pool_put */ + if (PREDICT_FALSE (tc == 0)) + return; + tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + } + + if (tc->state >= TCP_STATE_ESTABLISHED) + { + /* Lost FIN, retransmit and return */ + if (tcp_is_lost_fin (tc)) + { + tcp_send_fin (tc); + return; + } + + /* We're not in recovery so make sure rto_boff is 0 */ + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + + /* Increment RTO backoff (also equal to number of retries) and go back + * to first un-acked byte */ + tc->rto_boff += 1; + + /* First retransmit timeout */ + if (tc->rto_boff == 1) + tcp_rtx_timeout_cc (tc); + + tc->snd_nxt = tc->snd_una; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); + + /* Send one segment. 
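Offset 0 restarts from snd_una (snd_nxt was just rewound to it), and at most one MSS is sent.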
Note that n_bytes may be zero due to buffer shortfall */ + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + + /* TODO be less aggressive about this */ + scoreboard_clear (&tc->sack_sb); + + if (n_bytes == 0) + { + ASSERT (!b); + if (tc->snd_una == tc->snd_una_max) + return; + ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion); + clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2); + /* Try again eventually */ + tcp_retransmit_timer_set (tc); + return; + } + + bi = vlib_get_buffer_index (vm, b); + + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_retransmit_timer_update (tc); + } + /* Retransmit for SYN */ + else if (tc->state == TCP_STATE_SYN_SENT) + { + /* Half-open connection actually moved to established but we were + * waiting for syn retransmit to pop to call cleanup from the right + * thread. */ + if (tc->flags & TCP_CONN_HALF_OPEN_DONE) + { + if (tcp_half_open_connection_cleanup (tc)) + { + clib_warning ("could not remove half-open connection"); + ASSERT (0); + } + return; + } + + /* Try without increasing RTO a number of times. If this fails, + * start growing RTO exponentially */ + tc->rto_boff += 1; + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + tcp_make_syn (tc, b); + + tc->rtt_ts = 0; + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0); + + /* This goes straight to ipx_lookup. Retransmit timer set already */ + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + } + /* Retransmit SYN-ACK */ + else if (tc->state == TCP_STATE_SYN_RCVD) + { + tc->rto_boff += 1; + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + tc->rtt_ts = 0; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_make_synack (tc, b); + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1); + + /* Retransmit timer already updated, just enqueue to output */ + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + TCP_DBG ("connection state: %d", tc->state); + return; + } +} + +void +tcp_timer_retransmit_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 0); +} + +void +tcp_timer_retransmit_syn_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 1); +} + +/** + * Got 0 snd_wnd from peer, try to do something about it. + * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, max_snd_bytes, available_bytes, offset; + int n_bytes = 0; + u8 *data; + + tc = tcp_connection_get_if_valid (index, thread_index); + + if (!tc) + return; + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + offset = tc->snd_una_max - tc->snd_una; + + /* Reprogram persist if no new bytes available to send. 
We may have data + * next time */ + if (!available_bytes) + { + tcp_persist_timer_set (tc); + return; + } + + if (available_bytes <= offset) + { + ASSERT (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + return; + } + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* + * Try to force the first unsent segment (or buffer) + */ + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + data = tcp_init_buffer (vm, b); + + tcp_validate_txf_size (tc, offset); + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + max_snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer - MAX_HDRS_LEN); + n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, + max_snd_bytes); + b->current_length = n_bytes; + ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT) + || tc->snd_nxt == tc->snd_una_max + || tc->rto_boff > 1)); + + tcp_push_hdr_i (tc, b, tc->state, 0); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Just sent new data, enable retransmit */ + tcp_retransmit_timer_update (tc); +} + +/** + * Retransmit first unacked segment + */ +void +tcp_retransmit_first_unacked (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; + u32 bi, old_snd_nxt, n_bytes; + + old_snd_nxt = tc->snd_nxt; + tc->snd_nxt = tc->snd_una; + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + if (!n_bytes) + return; + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset, max_bytes; + vlib_buffer_t *b = 0; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, + tc->snd_congestion - tc->snd_una); + max_bytes = clib_min (max_bytes, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, + &b); + ASSERT (n_written); + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = clib_min (hole->end - sb->high_rxt, snd_space); + max_bytes = snd_limited ? 
clib_min (max_bytes, tc->snd_mss) : max_bytes; + if (max_bytes == 0) + break; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = sb->high_rxt; + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); + + /* Nothing left to retransmit */ + if (n_written == 0) + break; + + bi = vlib_get_buffer_index (vm, b); + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + ASSERT (n_written <= snd_space); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Fast retransmit without SACK info + */ +void +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, bi, old_snd_nxt; + int snd_space; + vlib_buffer_t *b; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; + tc->snd_nxt = tc->snd_una; + snd_space = tcp_available_snd_space (tc); + + while (snd_space > 0) + { + offset += n_written; + n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b); + + /* Nothing left to retransmit */ + if (n_written == 0) + break; + + bi = vlib_get_buffer_index (vm, b); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); +} + +always_inline u32 +tcp_session_has_ooo_data (tcp_connection_t * tc) +{ + stream_session_t *s = + stream_session_get (tc->c_s_index, tc->c_thread_index); + return svm_fifo_has_ooo_data (s->server_rx_fifo); +} + +always_inline uword +tcp46_output_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + next_index = node->cached_next_index; + tcp_set_time_now (my_thread_index); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_connection_t *tc0; + tcp_tx_trace_t *t0; + tcp_header_t *th0 = 0; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + if (PREDICT_FALSE (tc0 == 0 || tc0->state == TCP_STATE_CLOSED)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + next0 = TCP_OUTPUT_NEXT_DROP; + goto done; + } + + th0 = vlib_buffer_get_current (b0); + TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length); + + if (is_ip4) + { + vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4, + IP_PROTOCOL_TCP, 1); + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; + } + else + { + ip6_header_t *ih0; + ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, + &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); + b0->flags |= 
VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; + } + + /* Filter out DUPACKs if there are no OOO segments left */ + if (PREDICT_FALSE + (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) + { + if (!tcp_session_has_ooo_data (tc0)) + { + error0 = TCP_ERROR_FILTERED_DUPACKS; + next0 = TCP_OUTPUT_NEXT_DROP; + goto done; + } + } + + /* Stop DELACK timer and fix flags */ + tc0->flags &= ~(TCP_CONN_SNDACK); + tcp_timer_reset (tc0, TCP_TIMER_DELACK); + + /* If not retransmitting + * 1) update snd_una_max (SYN, SYNACK, FIN) + * 2) If we're not tracking an ACK, start tracking */ + if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) + { + tc0->snd_una_max = tc0->snd_nxt; + if (tc0->rtt_ts == 0) + { + tc0->rtt_ts = tcp_time_now (); + tc0->rtt_seq = tc0->snd_nxt; + } + } + + /* Set the retransmit timer if not set already and not + * doing a pure ACK */ + if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT) + && tc0->snd_nxt != tc0->snd_una) + { + tcp_retransmit_timer_set (tc0); + tc0->rto_boff = 0; + } + +#if 0 + /* Make sure we haven't lost route to our peer */ + if (PREDICT_FALSE (tc0->last_fib_check + < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) + { + if (PREDICT_TRUE + (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0))) + { + tc0->last_fib_check = tc0->snd_opts.tsval; + } + else + { + clib_warning ("lost connection to peer"); + tcp_connection_reset (tc0); + goto done; + } + } + + /* Use pre-computed dpo to set next node */ + next0 = tc0->c_rmt_dpo.dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; + + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + done: + b0->error = node->errors[error0]; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + if (th0) + { + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + else + { + memset (&t0->tcp_header, 0, sizeof (t0->tcp_header)); + } + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_output_node) = +{ + .function = tcp4_output,.name = "tcp4-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp4_output_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_output_node) = +{ + .function = tcp6_output, + .name = "tcp6-output", + /* Takes a vector of packets. 
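Frames are queued to this node by tcp_enqueue_to_output and flushed by tcp_flush_frame_to_output.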
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp6_output_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output); + +u32 +tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + tcp_connection_t *tc; + + tc = (tcp_connection_t *) tconn; + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd)); + + if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } + return 0; +} + +typedef enum _tcp_reset_next +{ + TCP_RESET_NEXT_DROP, + TCP_RESET_NEXT_IP_LOOKUP, + TCP_RESET_N_NEXT +} tcp_reset_next_t; + +#define foreach_tcp4_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip6-lookup") + +static uword +tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->thread_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_tx_trace_t *t0; + tcp_header_t *th0; + u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (tcp_make_reset_in_place (vm, b0, vnet_buffer (b0)->tcp.flags, + my_thread_index, is_ip4)) + { + error0 = TCP_ERROR_LOOKUP_DROPS; + next0 = TCP_RESET_NEXT_DROP; + goto done; + } + + /* Prepare to send to IP lookup */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0; + next0 = TCP_RESET_NEXT_IP_LOOKUP; + + done: + b0->error = node->errors[error0]; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + th0 = vlib_buffer_get_current (b0); + if (is_ip4) + th0 = ip4_next_header ((ip4_header_t *) th0); + else + th0 = ip6_next_header ((ip6_header_t *) th0); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 1); +} + +static uword +tcp6_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_reset_node) = { + .function = tcp4_send_reset, + .name = "tcp4-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + 
foreach_tcp4_reset_next +#undef _ + }, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_reset_node, tcp4_send_reset); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_reset_node) = { + .function = tcp6_send_reset, + .name = "tcp6-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + foreach_tcp6_reset_next +#undef _ + }, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_reset_node, tcp6_send_reset); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h new file mode 100644 index 00000000..9ccfe655 --- /dev/null +++ b/src/vnet/tcp/tcp_packet.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_tcp_packet_h +#define included_tcp_packet_h + +#include <vnet/vnet.h> + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) /**< No more data from sender. */ \ + _ (SYN) /**< Synchronize sequence numbers. */ \ + _ (RST) /**< Reset the connection. */ \ + _ (PSH) /**< Push function. */ \ + _ (ACK) /**< Ack field significant. */ \ + _ (URG) /**< Urgent pointer field significant. */ \ + _ (ECE) /**< ECN-echo. Receiver got CE packet */ \ + _ (CWR) /**< Sender reduced congestion window */ + +enum +{ +#define _(f) TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ + TCP_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ +}; + +typedef struct _tcp_header +{ + union + { + struct + { + u16 src_port; /**< Source port. */ + u16 dst_port; /**< Destination port. */ + }; + struct + { + u16 src, dst; + }; + }; + + u32 seq_number; /**< Sequence number of the first data octet in this + * segment, except when SYN is present. If SYN + * is present the seq number is is the ISN and the + * first data octet is ISN+1 */ + u32 ack_number; /**< Acknowledgement number if ACK is set. It contains + * the value of the next sequence number the sender + * of the segment is expecting to receive. */ + u8 data_offset_and_reserved; + u8 flags; /**< Flags: see the macro above */ + u16 window; /**< Number of bytes sender is willing to receive. */ + + u16 checksum; /**< Checksum of TCP pseudo header and data. */ + u16 urgent_pointer; /**< Seq number of the byte after the urgent data. 
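Only meaningful when the URG flag is set.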
*/ +} __attribute__ ((packed)) tcp_header_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4) +#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN) +#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN) +#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST) +#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH) +#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK) +#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG) +#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE) +#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR) + +/* Flag tests that return 0 or 1 */ +#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN) +#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN) + +always_inline int +tcp_header_bytes (tcp_header_t * t) +{ + return tcp_doff (t) * sizeof (u32); +} + +/* + * TCP options. + */ + +typedef enum tcp_option_type +{ + TCP_OPTION_EOL = 0, /**< End of options. */ + TCP_OPTION_NOOP = 1, /**< No operation. */ + TCP_OPTION_MSS = 2, /**< Limit MSS. */ + TCP_OPTION_WINDOW_SCALE = 3, /**< Window scale. */ + TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */ + TCP_OPTION_SACK_BLOCK = 5, /**< Selective Ack block. */ + TCP_OPTION_TIMESTAMP = 8, /**< Timestamps. */ + TCP_OPTION_UTO = 28, /**< User timeout. */ + TCP_OPTION_AO = 29, /**< Authentication Option. */ +} tcp_option_type_t; + +#define foreach_tcp_options_flag \ + _ (MSS) /**< MSS advertised in SYN */ \ + _ (TSTAMP) /**< Timestamp capability advertised in SYN */ \ + _ (WSCALE) /**< Wnd scale capability advertised in SYN */ \ + _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */ \ + _ (SACK) /**< SACK present */ + +enum +{ +#define _(f) TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ + TCP_OPTIONS_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ +}; + +typedef struct _sack_block +{ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number (first outside) */ +} sack_block_t; + +typedef struct +{ + u8 flags; /** Option flags, see above */ + + u16 mss; /**< Maximum segment size advertised */ + u8 wscale; /**< Window scale advertised */ + u32 tsval; /**< Timestamp value */ + u32 tsecr; /**< Echoed/reflected time stamp */ + sack_block_t *sacks; /**< SACK blocks */ + u8 n_sack_blocks; /**< Number of SACKs blocks */ +} tcp_options_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS) +#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP) +#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE) +#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK) +#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED) + +/* TCP option lengths */ +#define TCP_OPTION_LEN_EOL 1 +#define TCP_OPTION_LEN_NOOP 1 +#define TCP_OPTION_LEN_MSS 4 +#define TCP_OPTION_LEN_WINDOW_SCALE 3 +#define TCP_OPTION_LEN_SACK_PERMITTED 2 +#define TCP_OPTION_LEN_TIMESTAMP 10 +#define TCP_OPTION_LEN_SACK_BLOCK 8 + +#define TCP_HDR_LEN_MAX 60 +#define TCP_WND_MAX 65535U +#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ +#define TCP_OPTS_ALIGN 4 +#define TCP_OPTS_MAX_SACK_BLOCKS 3 +#endif /* included_tcp_packet_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_pg.c b/src/vnet/tcp/tcp_pg.c new file mode 100644 index 00000000..3be4592c --- /dev/null +++ b/src/vnet/tcp/tcp_pg.c @@ -0,0 +1,244 @@ +/* + * 
Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp_pg: TCP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) \ + _ (SYN) \ + _ (RST) \ + _ (PSH) \ + _ (ACK) \ + _ (URG) \ + _ (ECE) \ + _ (CWR) + +static void +tcp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, u32 * packets, u32 n_packets) +{ + vlib_main_t *vm = vlib_get_main (); + u32 ip_offset, tcp_offset; + + tcp_offset = g->start_byte_offset; + ip_offset = (g - 1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t *p0; + ip4_header_t *ip0; + tcp_header_t *tcp0; + ip_csum_t sum0; + u32 tcp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + tcp0 = (void *) (p0->data + tcp_offset); + tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + /* Initialize checksum with header. */ + if (BITS (sum0) == 32) + { + sum0 = clib_mem_unaligned (&ip0->src_address, u32); + sum0 = + ip_csum_with_carry (sum0, + clib_mem_unaligned (&ip0->dst_address, u32)); + } + else + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. 
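The incremental sum below covers the whole TCP segment, including this field, so it must be zero before summing.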
*/ + tcp0->checksum = 0; + + sum0 = + ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); + + tcp0->checksum = ~ip_csum_fold (sum0); + } +} + +typedef struct +{ + pg_edit_t src, dst; + pg_edit_t seq_number, ack_number; + pg_edit_t data_offset_and_reserved; +#define _(f) pg_edit_t f##_flag; + foreach_tcp_flag +#undef _ + pg_edit_t window; + pg_edit_t checksum; + pg_edit_t urgent_pointer; +} pg_tcp_header_t; + +static inline void +pg_tcp_header_init (pg_tcp_header_t * p) +{ + /* Initialize fields that are not bit fields in the TCP header. */ +#define _(f) pg_edit_init (&p->f, tcp_header_t, f); + _(src); + _(dst); + _(seq_number); + _(ack_number); + _(window); + _(checksum); + _(urgent_pointer); +#undef _ + + /* Initialize bit fields. */ +#define _(f) \ + pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \ + flags, \ + TCP_FLAG_BIT_##f, 1); + + foreach_tcp_flag +#undef _ + pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t, + data_offset_and_reserved, 4, 4); +} + +uword +unformat_pg_tcp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t *s = va_arg (*args, pg_stream_t *); + pg_tcp_header_t *p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t), + &group_index); + pg_tcp_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->seq_number, 0); + pg_edit_set_fixed (&p->ack_number, 0); + + pg_edit_set_fixed (&p->data_offset_and_reserved, + sizeof (tcp_header_t) / sizeof (u32)); + + pg_edit_set_fixed (&p->window, 4096); + pg_edit_set_fixed (&p->urgent_pointer, 0); + +#define _(f) pg_edit_set_fixed (&p->f##_flag, 0); + foreach_tcp_flag +#undef _ + p->checksum.type = PG_EDIT_UNSPECIFIED; + + if (!unformat (input, "TCP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src, + unformat_pg_edit, unformat_tcp_udp_port, &p->dst)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "window %U", + unformat_pg_edit, unformat_pg_number, &p->window)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, unformat_pg_number, &p->checksum)) + ; + + else if (unformat (input, "seqnum %U", unformat_pg_edit, + unformat_pg_number, &p->seq_number)) + ; + else if (unformat (input, "acknum %U", unformat_pg_edit, + unformat_pg_number, &p->ack_number)) + ; + /* Flags. */ +#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1); + foreach_tcp_flag +#undef _ + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t *im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t *pi; + + pi = 0; + if (p->dst.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (!unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t *g = pg_stream_get_group (s, group_index); + g->edit_function = tcp_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + } + +error: + /* Free up any edits we may have added. 
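This releases the edit group allocated by pg_create_edit_group at the top of this function.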
*/ + pg_free_edit_group (s); + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c new file mode 100644 index 00000000..9b2a8ac7 --- /dev/null +++ b/src/vnet/tcp/tcp_syn_filter4.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/feature/feature.h> +#include <vnet/ip/ip.h> +#include <vppinfra/xxhash.h> + +typedef struct +{ + f64 next_reset; + f64 reset_interval; + u8 *syn_counts; +} syn_filter4_runtime_t; + +typedef struct +{ + u32 next_index; + int not_a_syn; + u8 filter_value; +} syn_filter4_trace_t; + +/* packet trace format function */ +static u8 * +format_syn_filter4_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + syn_filter4_trace_t *t = va_arg (*args, syn_filter4_trace_t *); + + s = format (s, "SYN_FILTER4: next index %d, %s", + t->next_index, t->not_a_syn ? "not a syn" : "syn"); + if (t->not_a_syn == 0) + s = format (s, ", filter value %d\n", t->filter_value); + else + s = format (s, "\n"); + return s; +} + +static vlib_node_registration_t syn_filter4_node; + +#define foreach_syn_filter_error \ +_(THROTTLED, "TCP SYN packet throttle drops") \ +_(OK, "TCP SYN packets passed") + +typedef enum +{ +#define _(sym,str) SYN_FILTER_ERROR_##sym, + foreach_syn_filter_error +#undef _ + SYN_FILTER_N_ERROR, +} syn_filter_error_t; + +static char *syn_filter4_error_strings[] = { +#define _(sym,string) string, + foreach_syn_filter_error +#undef _ +}; + +typedef enum +{ + SYN_FILTER_NEXT_DROP, + SYN_FILTER_N_NEXT, +} syn_filter_next_t; + +extern vnet_feature_arc_registration_t vnet_feat_arc_ip4_local; + +static uword +syn_filter4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + syn_filter_next_t next_index; + u32 ok_syn_packets = 0; + vnet_feature_main_t *fm = &feature_main; + u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index; + vnet_feature_config_main_t *cm = &fm->feature_config_mains[arc_index]; + syn_filter4_runtime_t *rt = (syn_filter4_runtime_t *) node->runtime_data; + f64 now = vlib_time_now (vm); + /* Shut up spurious gcc warnings. 
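The per-source counters are assigned only on the SYN path but read in the trace blocks below.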
*/ + u8 *c0 = 0, *c1 = 0, *c2 = 0, *c3 = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (now > rt->next_reset) + { + memset (rt->syn_counts, 0, vec_len (rt->syn_counts)); + rt->next_reset = now + rt->reset_interval; + } + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 8 && n_left_to_next >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 next0, next1, next2, next3; + ip4_header_t *ip0, *ip1, *ip2, *ip3; + tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3; + u32 not_a_syn0 = 1, not_a_syn1 = 1, not_a_syn2 = 1, not_a_syn3 = 1; + u64 hash0, hash1, hash2, hash3; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p4, *p5, *p6, *p7; + + p4 = vlib_get_buffer (vm, from[4]); + p5 = vlib_get_buffer (vm, from[5]); + p6 = vlib_get_buffer (vm, from[6]); + p7 = vlib_get_buffer (vm, from[7]); + + vlib_prefetch_buffer_header (p4, LOAD); + vlib_prefetch_buffer_header (p5, LOAD); + vlib_prefetch_buffer_header (p6, LOAD); + vlib_prefetch_buffer_header (p7, LOAD); + + CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + to_next[2] = bi2 = from[2]; + to_next[3] = bi3 = from[3]; + from += 4; + to_next += 4; + n_left_from -= 4; + n_left_to_next -= 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b1->current_config_index, + &next1, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b2->current_config_index, + &next2, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b3->current_config_index, + &next3, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace00; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace00; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace00; + } + *c0 += 1; + ok_syn_packets++; + + trace00: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* Not TCP? */ + ip1 = vlib_buffer_get_current (b1); + if (ip1->protocol != IP_PROTOCOL_TCP) + goto trace01; + + tcp1 = ip4_next_header (ip1); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. 
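+ * (TCP_FLAG_SYN is bit 1 of the flags octet, i.e. 0x2, hence the literal.)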
+ */ + if (PREDICT_TRUE (!(tcp1->flags & 0x2))) + goto trace01; + + not_a_syn1 = 0; + hash1 = clib_xxhash ((u64) ip1->src_address.as_u32); + c1 = &rt->syn_counts[hash1 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c1 >= 0x80)) + { + next1 = SYN_FILTER_NEXT_DROP; + b1->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace01; + } + *c1 += 1; + ok_syn_packets++; + + trace01: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->not_a_syn = not_a_syn1; + t->next_index = next1; + t->filter_value = not_a_syn1 ? 0 : *c1; + } + + /* Not TCP? */ + ip2 = vlib_buffer_get_current (b2); + if (ip2->protocol != IP_PROTOCOL_TCP) + goto trace02; + + tcp2 = ip4_next_header (ip2); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp2->flags & 0x2))) + goto trace02; + + not_a_syn2 = 0; + hash2 = clib_xxhash ((u64) ip2->src_address.as_u32); + c2 = &rt->syn_counts[hash2 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c2 >= 0x80)) + { + next2 = SYN_FILTER_NEXT_DROP; + b2->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace02; + } + *c2 += 1; + ok_syn_packets++; + + trace02: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b2->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b2, sizeof (*t)); + t->not_a_syn = not_a_syn2; + t->next_index = next2; + t->filter_value = not_a_syn2 ? 0 : *c2; + } + + /* Not TCP? */ + ip3 = vlib_buffer_get_current (b3); + if (ip3->protocol != IP_PROTOCOL_TCP) + goto trace03; + + tcp3 = ip4_next_header (ip3); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp3->flags & 0x2))) + goto trace03; + + not_a_syn3 = 0; + hash3 = clib_xxhash ((u64) ip3->src_address.as_u32); + c3 = &rt->syn_counts[hash3 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c3 >= 0x80)) + { + next3 = SYN_FILTER_NEXT_DROP; + b3->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace03; + } + *c3 += 1; + ok_syn_packets++; + + trace03: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b3->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b3, sizeof (*t)); + t->not_a_syn = not_a_syn3; + t->next_index = next3; + t->filter_value = not_a_syn3 ? 0 : *c3; + } + vlib_validate_buffer_enqueue_x4 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, bi2, bi3, + next0, next1, next2, next3); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip4_header_t *ip0; + tcp_header_t *tcp0; + u32 not_a_syn0 = 1; + u32 hash0; + u8 *c0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace0; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. 
+ */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace0; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace0; + } + *c0 += 1; + ok_syn_packets++; + + trace0: + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, syn_filter4_node.index, + SYN_FILTER_ERROR_OK, ok_syn_packets); + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (syn_filter4_node, static) = +{ + .function = syn_filter4_node_fn, + .name = "syn-filter-4", + .vector_size = sizeof (u32), + .format_trace = format_syn_filter4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .runtime_data_bytes = sizeof (syn_filter4_runtime_t), + .n_errors = ARRAY_LEN(syn_filter4_error_strings), + .error_strings = syn_filter4_error_strings, + + .n_next_nodes = SYN_FILTER_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SYN_FILTER_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (syn_filter4_node, syn_filter4_node_fn); + +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (syn_filter_4, static) = +{ + .arc_name = "ip4-local", + .node_name = "syn-filter-4", + .runs_before = VNET_FEATURES("ip4-local-end-of-arc"), +}; +/* *INDENT-ON* */ + +int +syn_filter_enable_disable (u32 sw_if_index, int enable_disable) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *sw; + int rv = 0; + + /* Utterly wrong? */ + if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + /* Not a physical port? */ + sw = vnet_get_sw_interface (vnm, sw_if_index); + if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + if (enable_disable) + { + syn_filter4_runtime_t *rt; + + /* *INDENT-OFF* */ + foreach_vlib_main ({ + rt = vlib_node_get_runtime_data (this_vlib_main, syn_filter4_node.index); + vec_validate (rt->syn_counts, 1023); + /* + * Given perfect dispersion / optimal hashing results: + * Allow 128k (successful) syns/sec. 1024 buckets, each of which + * absorbs 128 syns before filtering. Reset table once a second. + * Reality bites, let's try resetting once every 100ms. 
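+ * Each source-hash bucket admits at most 128 SYNs per interval (the 0x80 tests above).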
+ */ + rt->reset_interval = 0.1; /* reset interval in seconds */ + }); + /* *INDENT-ON* */ + } + + rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4", + sw_if_index, enable_disable, 0, 0); + + return rv; +} + +static clib_error_t * +syn_filter_enable_disable_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t *vnm = vnet_get_main (); + u32 sw_if_index = ~0; + int enable_disable = 1; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "disable")) + enable_disable = 0; + else if (unformat (input, "%U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + ; + else + break; + } + + if (sw_if_index == ~0) + return clib_error_return (0, "Please specify an interface..."); + + rv = syn_filter_enable_disable (sw_if_index, enable_disable); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + return clib_error_return + (0, "Invalid interface, only works on physical ports"); + break; + + case VNET_API_ERROR_UNIMPLEMENTED: + return clib_error_return (0, + "Device driver doesn't support redirection"); + break; + + case VNET_API_ERROR_INVALID_VALUE: + return clib_error_return (0, "feature arc not found"); + + case VNET_API_ERROR_INVALID_VALUE_2: + return clib_error_return (0, "feature node not found"); + + default: + return clib_error_return (0, "syn_filter_enable_disable returned %d", + rv); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (sr_content_command, static) = +{ + .path = "ip syn filter", + .short_help = "ip syn filter <interface-name> [disable]", + .function = syn_filter_enable_disable_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c new file mode 100644 index 00000000..37640cc6 --- /dev/null +++ b/src/vnet/tcp/tcp_test.c @@ -0,0 +1,1764 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/tcp/tcp.h> + +#define TCP_TEST_I(_cond, _comment, _args...) \ +({ \ + int _evald = (_cond); \ + if (!(_evald)) { \ + fformat(stderr, "FAIL:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } else { \ + fformat(stderr, "PASS:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } \ + _evald; \ +}) + +#define TCP_TEST(_cond, _comment, _args...) 
\ +{ \ + if (!TCP_TEST_I(_cond, _comment, ##_args)) { \ + return 1; \ + } \ +} + +/* *INDENT-OFF* */ +scoreboard_trace_elt_t sb_trace[] = {}; +/* *INDENT-ON* */ + +static int +tcp_test_scoreboard_replay (vlib_main_t * vm, unformat_input_t * input) +{ + int verbose = 0; + tcp_connection_t _tc, *tc = &_tc; + u8 *s = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "detail")) + verbose = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + +#if TCP_SCOREBOARD_TRACE + tc->sack_sb.trace = sb_trace; +#endif + s = tcp_scoreboard_replay (s, tc, verbose); + vlib_cli_output (vm, "%v", s); + return 0; +} + +static int +tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_connection_t _tc, *tc = &_tc; + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *sacks = 0, block; + sack_scoreboard_hole_t *hole; + int i, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "replay")) + return tcp_test_scoreboard_replay (vm, input); + } + + memset (tc, 0, sizeof (*tc)); + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; + scoreboard_init (&tc->sack_sb); + + for (i = 0; i < 1000 / 100; i++) + { + block.start = i * 100; + block.end = (i + 1) * 100; + vec_add1 (sacks, block); + } + + /* + * Inject even blocks + */ + + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]); + } + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 0); + + if (verbose) + vlib_cli_output (vm, "sb after even blocks:\n%U", format_tcp_scoreboard, + sb); + + TCP_TEST ((pool_elts (sb->holes) == 5), + "scoreboard has %d elements", pool_elts (sb->holes)); + + /* First SACK block should be rejected */ + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 0 && hole->end == 200), + "first hole start %u end %u", hole->start, hole->end); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 900 && hole->end == 1000), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 400), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->last_sacked_bytes == 400), + "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked); + /* + * Inject odd blocks + */ + + vec_reset_length (tc->rcv_opts.sacks); + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); + } + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 0); + + if (verbose) + vlib_cli_output (vm, "sb after odd blocks:\n%U", format_tcp_scoreboard, + sb); + + hole = scoreboard_first_hole (sb); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d holes", pool_elts (sb->holes)); + TCP_TEST ((hole->start == 0 && hole->end == 100), + "first hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); + TCP_TEST ((sb->last_sacked_bytes == 500), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Ack until byte 100, all bytes are 
now acked + sacked + */ + tcp_rcv_sacks (tc, 100); + if (verbose) + vlib_cli_output (vm, "ack until byte 100:\n%U", format_tcp_scoreboard, + sb); + + TCP_TEST ((pool_elts (sb->holes) == 0), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 900), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Add new block + */ + + vec_reset_length (tc->rcv_opts.sacks); + + block.start = 1200; + block.end = 1300; + vec_add1 (tc->rcv_opts.sacks, block); + + if (verbose) + vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); + tc->snd_una_max = 1500; + tc->snd_una = 1000; + tc->snd_nxt = 1500; + tcp_rcv_sacks (tc, 1000); + + if (verbose) + vlib_cli_output (vm, "sb snd_una_max 1500, snd_una 1000:\n%U", + format_tcp_scoreboard, sb); + + TCP_TEST ((sb->snd_una_adv == 0), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((pool_elts (sb->holes) == 2), + "scoreboard has %d holes", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 1000 && hole->end == 1200), + "first hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->snd_una_adv == 0), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 1300 && hole->end == 1500), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 100), "sacked bytes %d", sb->sacked_bytes); + + /* + * Ack first hole + */ + + vec_reset_length (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 1200); + + if (verbose) + vlib_cli_output (vm, "sb ack up to byte 1200:\n%U", format_tcp_scoreboard, + sb); + + TCP_TEST ((sb->snd_una_adv == 100), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d elements", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX + && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid"); + TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d", + sb->last_bytes_delivered); + + /* + * Add some more blocks and then remove all + */ + vec_reset_length (tc->rcv_opts.sacks); + tc->snd_una += sb->snd_una_adv; + tc->snd_una_max = 1900; + for (i = 0; i < 5; i++) + { + block.start = i * 100 + 1200; + block.end = (i + 1) * 100 + 1200; + vec_add1 (tc->rcv_opts.sacks, block); + } + tcp_rcv_sacks (tc, 1900); + + scoreboard_clear (sb); + if (verbose) + vlib_cli_output (vm, "sb cleared all:\n%U", format_tcp_scoreboard, sb); + + TCP_TEST ((pool_elts (sb->holes) == 0), + "number of holes %d", pool_elts (sb->holes)); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); + + /* + * Re-inject odd blocks and ack them all + */ + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + for (i = 0; i < 5; i++) + { + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); + } + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 0); + if (verbose) + vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", + 
format_tcp_scoreboard, sb);
+
+  tcp_rcv_sacks (tc, 950);
+
+  if (verbose)
+    vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U",
+		     format_tcp_scoreboard, sb);
+
+  TCP_TEST ((pool_elts (sb->holes) == 0),
+	    "scoreboard has %d elements", pool_elts (sb->holes));
+  TCP_TEST ((sb->snd_una_adv == 50), "snd_una_adv %u", sb->snd_una_adv);
+  TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+  TCP_TEST ((sb->last_sacked_bytes == 0),
+	    "last sacked bytes %d", sb->last_sacked_bytes);
+
+  /*
+   * Inject one block, ack it and overlap hole
+   */
+
+  tc->snd_una = 0;
+  tc->snd_una_max = 1000;
+  tc->snd_nxt = 1000;
+
+  block.start = 100;
+  block.end = 500;
+  vec_add1 (tc->rcv_opts.sacks, block);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+
+  tcp_rcv_sacks (tc, 0);
+
+  if (verbose)
+    vlib_cli_output (vm, "sb added [100, 500]:\n%U",
+		     format_tcp_scoreboard, sb);
+
+  tcp_rcv_sacks (tc, 800);
+
+  if (verbose)
+    vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb);
+
+  TCP_TEST ((pool_elts (sb->holes) == 1),
+	    "scoreboard has %d elements", pool_elts (sb->holes));
+  TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
+  TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
+  TCP_TEST ((sb->last_sacked_bytes == 0),
+	    "last sacked bytes %d", sb->last_sacked_bytes);
+  TCP_TEST ((sb->last_bytes_delivered == 400),
+	    "last bytes delivered %d", sb->last_bytes_delivered);
+
+  /*
+   * One hole close to head, patch head, split in two and start acking
+   * the lowest part
+   */
+  scoreboard_clear (sb);
+  tc->snd_una = 0;
+  tc->snd_una_max = 1000;
+  tc->snd_nxt = 1000;
+
+  block.start = 500;
+  block.end = 1000;
+  vec_add1 (tc->rcv_opts.sacks, block);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+
+  tcp_rcv_sacks (tc, 0);
+  if (verbose)
+    vlib_cli_output (vm, "sb added [500, 1000]:\n%U",
+		     format_tcp_scoreboard, sb);
+
+  vec_reset_length (tc->rcv_opts.sacks);
+  block.start = 300;
+  block.end = 400;
+  vec_add1 (tc->rcv_opts.sacks, block);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
+  tcp_rcv_sacks (tc, 100);
+  if (verbose)
+    vlib_cli_output (vm, "sb added [0, 100] [300, 400]:\n%U",
+		     format_tcp_scoreboard, sb);
+  TCP_TEST ((pool_elts (sb->holes) == 2),
+	    "scoreboard has %d elements", pool_elts (sb->holes));
+
+  tc->snd_una = 100;
+  tcp_rcv_sacks (tc, 200);
+  tcp_rcv_sacks (tc, 300);
+  if (verbose)
+    vlib_cli_output (vm, "sb added [0, 300]:\n%U", format_tcp_scoreboard, sb);
+  TCP_TEST ((sb->sacked_bytes == 500), "sacked bytes %d", sb->sacked_bytes);
+
+  return 0;
+}
+
+static int
+tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input)
+{
+  tcp_connection_t _tc, *tc = &_tc;
+  sack_block_t *sacks;
+  int i, verbose = 0, expected;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "verbose"))
+	verbose = 1;
+      else
+	{
+	  vlib_cli_output (vm, "parse error: '%U'", format_unformat_error,
+			   input);
+	  return -1;
+	}
+    }
+
+  memset (tc, 0, sizeof (*tc));
+
+  /*
+   * Add odd sack block pairs
+   */
+  for (i = 1; i < 10; i += 2)
+    {
+      tcp_update_sack_list (tc, i * 100, (i + 1) * 100);
+    }
+
+  TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d",
+	    vec_len (tc->snd_sacks), 5);
+  TCP_TEST ((tc->snd_sacks[0].start == 900),
+	    "first sack block start %u expected %u", tc->snd_sacks[0].start,
+	    900);
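+
+  /*
+   * (Editor's note: the == 900 check above relies on tcp_update_sack_list
+   * keeping the block list most-recently-reported first, as RFC 2018
+   * requires; the checks below additionally exercise its merging of
+   * adjacent blocks and its pruning of blocks overtaken by rcv_nxt.)
+   */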
"add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5; + TCP_TEST ((vec_len (tc->snd_sacks) == expected), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected); + TCP_TEST ((tc->snd_sacks[0].start == 1100), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* restore */ + vec_free (tc->snd_sacks); + tc->snd_sacks = sacks; + + /* + * Overlap first 2 segment + */ + tc->rcv_nxt = 300; + tcp_update_sack_list (tc, 300, 300); + if (verbose) + vlib_cli_output (vm, "overlap first 2 segments:\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 3); + TCP_TEST ((tc->snd_sacks[0].start == 900), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 500); + + /* + * Add a new segment + */ + tcp_update_sack_list (tc, 1100, 1200); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 4), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 4); + TCP_TEST ((tc->snd_sacks[0].start == 1100), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* + * Join middle segments + */ + tcp_update_sack_list (tc, 800, 900); + if (verbose) + vlib_cli_output (vm, "join middle segments [800, 900]\n%U", + format_tcp_sacks, tc); + + TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 3); + TCP_TEST ((tc->snd_sacks[0].start == 700), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* + * Advance rcv_nxt to overlap all + */ + tc->rcv_nxt = 1200; + tcp_update_sack_list (tc, 1200, 1200); + if (verbose) + vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 0), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 0); + + + /* + * Add 2 blocks, overwrite first and update rcv_nxt to also remove it + */ + + vec_reset_length (tc->snd_sacks); + tc->rcv_nxt = 0; + + tcp_update_sack_list (tc, 100, 200); + tcp_update_sack_list (tc, 300, 400); + + if (verbose) + vlib_cli_output (vm, "add [100, 200] [300, 400]\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 2), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), 2); + TCP_TEST ((tc->snd_sacks[0].start == 300), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 300); + + tc->rcv_nxt = 100; + tcp_update_sack_list (tc, 100, 100); + if (verbose) + vlib_cli_output (vm, "add [100, 200] rcv_nxt = 100\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 1), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), 1); + TCP_TEST ((tc->snd_sacks[0].start == 300), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 300); + return 0; +} + +static int +tcp_test_sack (vlib_main_t * vm, unformat_input_t * input) +{ + int res = 0; + + /* Run all tests */ + if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT) + { + if (tcp_test_sack_tx (vm, input)) + { + return -1; + } + + if (tcp_test_sack_rx (vm, input)) + { + return -1; + } + } + else + { + if (unformat (input, "tx")) + { + res = tcp_test_sack_tx (vm, input); + } + else if (unformat (input, "rx")) + { + res = tcp_test_sack_rx (vm, input); + } + } + + return res; +} + + +typedef struct +{ + u32 offset; + u32 len; +} test_pattern_t; + +/* *INDENT-OFF* */ +test_pattern_t test_pattern[] = { + {380, 8}, {768, 8}, {1156, 8}, 
{1544, 8}, {1932, 8}, {2320, 8}, {2708, 8}, + {2992, 8}, {372, 8}, {760, 8}, {1148, 8}, {1536, 8}, {1924, 8}, {2312, 8}, + {2700, 8}, {2984, 8}, {364, 8}, {752, 8}, {1140, 8}, {1528, 8}, {1916, 8}, + {2304, 8}, {2692, 8}, {2976, 8}, {356, 8}, {744, 8}, {1132, 8}, {1520, 8}, + {1908, 8}, {2296, 8}, {2684, 8}, {2968, 8}, {348, 8}, {736, 8}, {1124, 8}, + {1512, 8}, {1900, 8}, {2288, 8}, {2676, 8}, {2960, 8}, {340, 8}, {728, 8}, + {1116, 8}, {1504, 8}, {1892, 8}, {2280, 8}, {2668, 8}, {2952, 8}, {332, 8}, + {720, 8}, {1108, 8}, {1496, 8}, {1884, 8}, {2272, 8}, {2660, 8}, {2944, 8}, + {324, 8}, {712, 8}, {1100, 8}, {1488, 8}, {1876, 8}, {2264, 8}, {2652, 8}, + {2936, 8}, {316, 8}, {704, 8}, {1092, 8}, {1480, 8}, {1868, 8}, {2256, 8}, + {2644, 8}, {2928, 8}, {308, 8}, {696, 8}, {1084, 8}, {1472, 8}, {1860, 8}, + {2248, 8}, {2636, 8}, {2920, 8}, {300, 8}, {688, 8}, {1076, 8}, {1464, 8}, + {1852, 8}, {2240, 8}, {2628, 8}, {2912, 8}, {292, 8}, {680, 8}, {1068, 8}, + {1456, 8}, {1844, 8}, {2232, 8}, {2620, 8}, {2904, 8}, {284, 8}, {672, 8}, + {1060, 8}, {1448, 8}, {1836, 8}, {2224, 8}, {2612, 8}, {2896, 8}, {276, 8}, + {664, 8}, {1052, 8}, {1440, 8}, {1828, 8}, {2216, 8}, {2604, 8}, {2888, 8}, + {268, 8}, {656, 8}, {1044, 8}, {1432, 8}, {1820, 8}, {2208, 8}, {2596, 8}, + {2880, 8}, {260, 8}, {648, 8}, {1036, 8}, {1424, 8}, {1812, 8}, {2200, 8}, + {2588, 8}, {2872, 8}, {252, 8}, {640, 8}, {1028, 8}, {1416, 8}, {1804, 8}, + {2192, 8}, {2580, 8}, {2864, 8}, {244, 8}, {632, 8}, {1020, 8}, {1408, 8}, + {1796, 8}, {2184, 8}, {2572, 8}, {2856, 8}, {236, 8}, {624, 8}, {1012, 8}, + {1400, 8}, {1788, 8}, {2176, 8}, {2564, 8}, {2848, 8}, {228, 8}, {616, 8}, + {1004, 8}, {1392, 8}, {1780, 8}, {2168, 8}, {2556, 8}, {2840, 8}, {220, 8}, + {608, 8}, {996, 8}, {1384, 8}, {1772, 8}, {2160, 8}, {2548, 8}, {2832, 8}, + {212, 8}, {600, 8}, {988, 8}, {1376, 8}, {1764, 8}, {2152, 8}, {2540, 8}, + {2824, 8}, {204, 8}, {592, 8}, {980, 8}, {1368, 8}, {1756, 8}, {2144, 8}, + {2532, 8}, {2816, 8}, {196, 8}, {584, 8}, {972, 8}, {1360, 8}, {1748, 8}, + {2136, 8}, {2524, 8}, {2808, 8}, {188, 8}, {576, 8}, {964, 8}, {1352, 8}, + {1740, 8}, {2128, 8}, {2516, 8}, {2800, 8}, {180, 8}, {568, 8}, {956, 8}, + {1344, 8}, {1732, 8}, {2120, 8}, {2508, 8}, {2792, 8}, {172, 8}, {560, 8}, + {948, 8}, {1336, 8}, {1724, 8}, {2112, 8}, {2500, 8}, {2784, 8}, {164, 8}, + {552, 8}, {940, 8}, {1328, 8}, {1716, 8}, {2104, 8}, {2492, 8}, {2776, 8}, + {156, 8}, {544, 8}, {932, 8}, {1320, 8}, {1708, 8}, {2096, 8}, {2484, 8}, + {2768, 8}, {148, 8}, {536, 8}, {924, 8}, {1312, 8}, {1700, 8}, {2088, 8}, + {2476, 8}, {2760, 8}, {140, 8}, {528, 8}, {916, 8}, {1304, 8}, {1692, 8}, + {2080, 8}, {2468, 8}, {2752, 8}, {132, 8}, {520, 8}, {908, 8}, {1296, 8}, + {1684, 8}, {2072, 8}, {2460, 8}, {2744, 8}, {124, 8}, {512, 8}, {900, 8}, + {1288, 8}, {1676, 8}, {2064, 8}, {2452, 8}, {2736, 8}, {116, 8}, {504, 8}, + {892, 8}, {1280, 8}, {1668, 8}, {2056, 8}, {2444, 8}, {2728, 8}, {108, 8}, + {496, 8}, {884, 8}, {1272, 8}, {1660, 8}, {2048, 8}, {2436, 8}, {2720, 8}, + {100, 8}, {488, 8}, {876, 8}, {1264, 8}, {1652, 8}, {2040, 8}, {2428, 8}, + {2716, 4}, {92, 8}, {480, 8}, {868, 8}, {1256, 8}, {1644, 8}, {2032, 8}, + {2420, 8}, {84, 8}, {472, 8}, {860, 8}, {1248, 8}, {1636, 8}, {2024, 8}, + {2412, 8}, {76, 8}, {464, 8}, {852, 8}, {1240, 8}, {1628, 8}, {2016, 8}, + {2404, 8}, {68, 8}, {456, 8}, {844, 8}, {1232, 8}, {1620, 8}, {2008, 8}, + {2396, 8}, {60, 8}, {448, 8}, {836, 8}, {1224, 8}, {1612, 8}, {2000, 8}, + {2388, 8}, {52, 8}, {440, 8}, {828, 8}, {1216, 8}, {1604, 
8}, {1992, 8}, + {2380, 8}, {44, 8}, {432, 8}, {820, 8}, {1208, 8}, {1596, 8}, {1984, 8}, + {2372, 8}, {36, 8}, {424, 8}, {812, 8}, {1200, 8}, {1588, 8}, {1976, 8}, + {2364, 8}, {28, 8}, {416, 8}, {804, 8}, {1192, 8}, {1580, 8}, {1968, 8}, + {2356, 8}, {20, 8}, {408, 8}, {796, 8}, {1184, 8}, {1572, 8}, {1960, 8}, + {2348, 8}, {12, 8}, {400, 8}, {788, 8}, {1176, 8}, {1564, 8}, {1952, 8}, + {2340, 8}, {4, 8}, {392, 8}, {780, 8}, {1168, 8}, {1556, 8}, {1944, 8}, + {2332, 8}, + /* missing from original data set */ + {388, 4}, {776, 4}, {1164, 4}, {1552, 4}, {1940, 4}, {2328, 4}, +}; +/* *INDENT-ON* */ + +int +pattern_cmp (const void *arg1, const void *arg2) +{ + test_pattern_t *a1 = (test_pattern_t *) arg1; + test_pattern_t *a2 = (test_pattern_t *) arg2; + + if (a1->offset < a2->offset) + return -1; + else if (a1->offset > a2->offset) + return 1; + return 0; +} + +static u8 +fifo_validate_pattern (vlib_main_t * vm, test_pattern_t * pattern, + u32 pattern_length) +{ + test_pattern_t *tp = pattern; + int i; + + /* Go through the pattern and make 100% sure it's sane */ + for (i = 0; i < pattern_length - 1; i++) + { + if (tp->offset + tp->len != (tp + 1)->offset) + { + vlib_cli_output (vm, "[%d] missing {%d, %d}", i, + (tp->offset + tp->len), + (tp + 1)->offset - (tp->offset + tp->len)); + return 0; + } + tp++; + } + return 1; +} + +static test_pattern_t * +fifo_get_validate_pattern (vlib_main_t * vm, test_pattern_t * test_data, + u32 test_data_len) +{ + test_pattern_t *validate_pattern = 0; + + /* Validate, and try segments in order... */ + vec_validate (validate_pattern, test_data_len - 1); + memcpy (validate_pattern, test_data, + test_data_len * sizeof (test_pattern_t)); + qsort ((u8 *) validate_pattern, test_data_len, sizeof (test_pattern_t), + pattern_cmp); + + if (fifo_validate_pattern (vm, validate_pattern, test_data_len) == 0) + return 0; + + return validate_pattern; +} + +static svm_fifo_t * +fifo_prepare (u32 fifo_size) +{ + svm_fifo_t *f; + f = svm_fifo_create (fifo_size); + + /* Paint fifo data vector with -1's */ + memset (f->data, 0xFF, fifo_size); + + return f; +} + +static int +compare_data (u8 * data1, u8 * data2, u32 start, u32 len, u32 * index) +{ + int i; + + for (i = start; i < len; i++) + { + if (data1[i] != data2[i]) + { + *index = i; + return 1; + } + } + return 0; +} + +int +tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 1 << 20; + u32 *test_data = 0; + u32 offset; + int i, rv, verbose = 0; + u32 data_word, test_data_len, j; + ooo_segment_t *ooo_seg; + u8 *data, *s, *data_buf = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + } + + test_data_len = fifo_size / sizeof (u32); + vec_validate (test_data, test_data_len - 1); + + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i; + + f = fifo_prepare (fifo_size); + + /* + * Enqueue an initial (un-dequeued) chunk + */ + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) test_data); + TCP_TEST ((rv == sizeof (u32)), "enqueued %d", rv); + TCP_TEST ((f->tail == 4), "fifo tail %u", f->tail); + + /* + * Create 3 chunks in the future. The offsets are relative + * to the current fifo tail + */ + for (i = 0; i < 3; i++) + { + offset = (2 * i + 1) * sizeof (u32) - f->tail; + data = (u8 *) (test_data + (2 * i + 1)); + if (i == 0) + { + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), data); + rv = rv > 0 ? 
0 : rv; + } + else + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, + offset + sizeof (u32)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + } + + if (verbose) + vlib_cli_output (vm, "fifo after odd segs: %U", format_svm_fifo, f, 1); + + TCP_TEST ((f->tail == 8), "fifo tail %u", f->tail); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + /* + * Try adding a completely overlapped segment + */ + offset = 3 * sizeof (u32) - f->tail; + data = (u8 *) (test_data + 3); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + + if (verbose) + vlib_cli_output (vm, "fifo after overlap seg: %U", format_svm_fifo, f, 1); + + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + /* + * Make sure format functions are not buggy + */ + s = format (0, "%U", format_svm_fifo, f, 2); + vec_free (s); + + /* + * Paint some of missing data backwards + */ + for (i = 3; i > 1; i--) + { + offset = (2 * i + 0) * sizeof (u32) - f->tail; + data = (u8 *) (test_data + (2 * i + 0)); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i, offset, + offset + sizeof (u32)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + } + + if (verbose) + vlib_cli_output (vm, "fifo before missing link: %U", format_svm_fifo, f, + 1); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 12), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 16), + "first ooo seg length %u", ooo_seg->length); + + /* + * Enqueue the missing u32 + */ + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) (test_data + 2)); + if (verbose) + vlib_cli_output (vm, "fifo after missing link: %U", format_svm_fifo, f, + 1); + TCP_TEST ((rv == 20), "bytes to be enqueued %u", rv); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + /* + * Collect results + */ + for (i = 0; i < 7; i++) + { + rv = svm_fifo_dequeue_nowait (f, sizeof (u32), (u8 *) & data_word); + if (rv != sizeof (u32)) + { + clib_warning ("bytes dequeues %u", rv); + goto err; + } + if (data_word != test_data[i]) + { + clib_warning ("recovered [%d] %d not %d", i, data_word, + test_data[i]); + goto err; + } + } + + /* + * Test segment overlaps: last ooo segment overlaps all + */ + svm_fifo_free (f); + f = fifo_prepare (fifo_size); + + for (i = 0; i < 4; i++) + { + offset = (2 * i + 1) * sizeof (u32) - f->tail; + data = (u8 *) (test_data + (2 * i + 1)); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, + offset + sizeof (u32)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + } + + rv = svm_fifo_enqueue_with_offset (f, 8 - f->tail, 21, data); + TCP_TEST ((rv == 0), "ooo enqueued %u", rv); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + vec_validate (data_buf, vec_len (data)); + svm_fifo_peek (f, 0, vec_len (data), 
data_buf); + if (compare_data (data_buf, data, 8, vec_len (data), &j)) + { + TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); + } + vec_reset_length (data_buf); + + /* + * Test segment overlaps: enqueue and overlap ooo segments + */ + svm_fifo_free (f); + f = fifo_prepare (fifo_size); + + for (i = 0; i < 4; i++) + { + offset = (2 * i + 1) * sizeof (u32) - f->tail; + data = (u8 *) (test_data + (2 * i + 1)); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, + offset + sizeof (u32)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + } + + if (verbose) + vlib_cli_output (vm, "fifo after enqueue: %U", format_svm_fifo, f, 1); + + rv = svm_fifo_enqueue_nowait (f, 29, data); + if (verbose) + vlib_cli_output (vm, "fifo after enqueueing 29: %U", format_svm_fifo, f, + 1); + TCP_TEST ((rv == 32), "ooo enqueued %u", rv); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + vec_validate (data_buf, vec_len (data)); + svm_fifo_peek (f, 0, vec_len (data), data_buf); + if (compare_data (data_buf, data, 0, vec_len (data), &j)) + { + TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); + } + + /* Try to peek beyond the data */ + rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf); + TCP_TEST ((rv == 0), "peeked %u expected 0", rv); + + vec_free (data_buf); + svm_fifo_free (f); + vec_free (test_data); + + return 0; + +err: + svm_fifo_free (f); + vec_free (test_data); + return -1; +} + +static int +tcp_test_fifo2 (vlib_main_t * vm) +{ + svm_fifo_t *f; + u32 fifo_size = 1 << 20; + int i, rv, test_data_len; + u64 data64; + test_pattern_t *tp, *vp, *test_data; + ooo_segment_t *ooo_seg; + + test_data = test_pattern; + test_data_len = ARRAY_LEN (test_pattern); + + vp = fifo_get_validate_pattern (vm, test_data, test_data_len); + + /* Create a fifo */ + f = fifo_prepare (fifo_size); + + /* + * Try with sorted data + */ + for (i = 0; i < test_data_len; i++) + { + tp = vp + i; + data64 = tp->offset; + svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, + (u8 *) & data64); + } + + /* Expected result: one big fat chunk at offset 4 */ + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 4), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 2996), + "first ooo seg length %u", ooo_seg->length); + + data64 = 0; + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64); + TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); + + svm_fifo_free (f); + vec_free (vp); + + /* + * Now try it again w/ unsorted data... 
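+   * (Editor's note: arrival order should not matter here -- every segment
+   * is enqueued at its absolute offset, so the unsorted run must coalesce
+   * into the same single ooo segment covering [4, 3000) and splice to
+   * 3000 bytes once the 4 missing head bytes arrive in order.)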
+ */ + + f = fifo_prepare (fifo_size); + + for (i = 0; i < test_data_len; i++) + { + tp = &test_data[i]; + data64 = tp->offset; + rv = svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, + (u8 *) & data64); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + } + } + + /* Expecting the same result: one big fat chunk at offset 4 */ + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 4), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 2996), + "first ooo seg length %u", ooo_seg->length); + + data64 = 0; + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64); + + TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); + + svm_fifo_free (f); + + return 0; +} + +static int +tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 4 << 10; + u32 fifo_initial_offset = 0; + u32 total_size = 2 << 10; + int overlap = 0, verbose = 0, randomize = 1, drop = 0, in_seq_all = 0; + u8 *data_pattern = 0, *data_buf = 0; + test_pattern_t *tp, *generate = 0; + u32 nsegs = 2, seg_size, length_so_far; + u32 current_offset, offset_increment, len_this_chunk; + u32 seed = 0xdeaddabe, j; + int i, rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "fifo-size %d", &fifo_size)) + ; + else if (unformat (input, "total-size %d", &total_size)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "overlap")) + overlap = 1; + else if (unformat (input, "initial-offset %d", &fifo_initial_offset)) + ; + else if (unformat (input, "seed %d", &seed)) + ; + else if (unformat (input, "nsegs %d", &nsegs)) + ; + else if (unformat (input, "no-randomize")) + randomize = 0; + else if (unformat (input, "in-seq-all")) + in_seq_all = 1; + else if (unformat (input, "drop")) + drop = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + + if (total_size > fifo_size) + { + clib_warning ("total_size %d greater than fifo size %d", total_size, + fifo_size); + return -1; + } + if (overlap && randomize == 0) + { + clib_warning ("Can't enqueue in-order with overlap"); + return -1; + } + + /* + * Generate data + */ + vec_validate (data_pattern, total_size - 1); + for (i = 0; i < vec_len (data_pattern); i++) + data_pattern[i] = i & 0xff; + + /* + * Generate segments + */ + seg_size = total_size / nsegs; + length_so_far = 0; + current_offset = randomize; + while (length_so_far < total_size) + { + vec_add2 (generate, tp, 1); + len_this_chunk = clib_min (seg_size, total_size - length_so_far); + tp->offset = current_offset; + tp->len = len_this_chunk; + + if (overlap && (len_this_chunk == seg_size)) + do + { + offset_increment = len_this_chunk + % (1 + (random_u32 (&seed) % len_this_chunk)); + } + while (offset_increment == 0); + else + offset_increment = len_this_chunk; + + current_offset += offset_increment; + length_so_far = tp->offset + tp->len; + } + + /* + * Validate segment list. Only valid for non-overlap cases. 
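+   * (Editor's note: fifo_validate_pattern requires each segment to end
+   * exactly where the next one starts, which by construction cannot hold
+   * once random overlap has been generated.)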
+ */ + if (overlap == 0) + fifo_validate_pattern (vm, generate, vec_len (generate)); + + if (verbose) + { + vlib_cli_output (vm, "raw data pattern:"); + for (i = 0; i < vec_len (generate); i++) + { + vlib_cli_output (vm, "[%d] offset %u len %u", i, + generate[i].offset, generate[i].len); + } + } + + /* Randomize data pattern */ + if (randomize) + { + for (i = 0; i < vec_len (generate) / 2; i++) + { + u32 src_index, dst_index; + test_pattern_t _tmp, *tmp = &_tmp; + + src_index = random_u32 (&seed) % vec_len (generate); + dst_index = random_u32 (&seed) % vec_len (generate); + + tmp[0] = generate[dst_index]; + generate[dst_index] = generate[src_index]; + generate[src_index] = tmp[0]; + } + if (verbose) + { + vlib_cli_output (vm, "randomized data pattern:"); + for (i = 0; i < vec_len (generate); i++) + { + vlib_cli_output (vm, "[%d] offset %u len %u", i, + generate[i].offset, generate[i].len); + } + } + } + + /* + * Create a fifo and add segments + */ + f = fifo_prepare (fifo_size); + + /* manually set head and tail pointers to validate modular arithmetic */ + fifo_initial_offset = fifo_initial_offset % fifo_size; + f->head = fifo_initial_offset; + f->tail = fifo_initial_offset; + + for (i = !randomize; i < vec_len (generate); i++) + { + tp = generate + i; + svm_fifo_enqueue_with_offset (f, + fifo_initial_offset + tp->offset - + f->tail, tp->len, + (u8 *) data_pattern + tp->offset); + } + + /* Add the first segment in order for non random data */ + if (!randomize) + svm_fifo_enqueue_nowait (f, generate[0].len, (u8 *) data_pattern); + + /* + * Expected result: one big fat chunk at offset 1 if randomize == 1 + */ + + if (verbose) + vlib_cli_output (vm, "fifo before missing link: %U", + format_svm_fifo, f, 1 /* verbose */ ); + + /* + * Add the missing byte if segments were randomized + */ + if (randomize) + { + u32 bytes_to_enq = 1; + if (in_seq_all) + bytes_to_enq = total_size; + rv = svm_fifo_enqueue_nowait (f, bytes_to_enq, data_pattern + 0); + + if (verbose) + vlib_cli_output (vm, "in-order enqueue returned %d", rv); + + TCP_TEST ((rv == total_size), "enqueued %u expected %u", rv, + total_size); + + } + + TCP_TEST ((svm_fifo_has_ooo_data (f) == 0), "number of ooo segments %u", + svm_fifo_number_ooo_segments (f)); + + /* + * Test if peeked data is the same as original data + */ + vec_validate (data_buf, vec_len (data_pattern)); + svm_fifo_peek (f, 0, vec_len (data_pattern), data_buf); + if (compare_data (data_buf, data_pattern, 0, vec_len (data_pattern), &j)) + { + TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], + data_pattern[j]); + } + vec_reset_length (data_buf); + + /* + * Dequeue or drop all data + */ + if (drop) + { + svm_fifo_dequeue_drop (f, vec_len (data_pattern)); + } + else + { + svm_fifo_dequeue_nowait (f, vec_len (data_pattern), data_buf); + if (compare_data + (data_buf, data_pattern, 0, vec_len (data_pattern), &j)) + { + TCP_TEST (0, "[%d] dequeued %u expected %u", j, data_buf[j], + data_pattern[j]); + } + } + + TCP_TEST ((svm_fifo_max_dequeue (f) == 0), "fifo has %d bytes", + svm_fifo_max_dequeue (f)); + + svm_fifo_free (f); + vec_free (data_pattern); + vec_free (data_buf); + + return 0; +} + +static int +tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 6 << 10; + u32 fifo_initial_offset = 1000000000; + u32 test_n_bytes = 5000, j; + u8 *test_data = 0, *data_buf = 0; + int i, rv, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { 
+ clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + + /* + * Create a fifo and add segments + */ + f = fifo_prepare (fifo_size); + + /* Set head and tail pointers */ + fifo_initial_offset = fifo_initial_offset % fifo_size; + svm_fifo_init_pointers (f, fifo_initial_offset); + + vec_validate (test_data, test_n_bytes - 1); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i; + + for (i = test_n_bytes - 1; i > 0; i--) + { + rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i - f->tail, + sizeof (u8), &test_data[i]); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + svm_fifo_free (f); + vec_free (test_data); + return -1; + } + } + + svm_fifo_enqueue_nowait (f, sizeof (u8), &test_data[0]); + + vec_validate (data_buf, vec_len (test_data)); + + svm_fifo_dequeue_nowait (f, vec_len (test_data), data_buf); + rv = compare_data (data_buf, test_data, 0, vec_len (test_data), &j); + if (rv) + vlib_cli_output (vm, "[%d] dequeued %u expected %u", j, data_buf[j], + test_data[j]); + TCP_TEST ((rv == 0), "dequeued compared to original returned %d", rv); + + svm_fifo_free (f); + vec_free (test_data); + return 0; +} + +static u32 +fifo_pos (svm_fifo_t * f, u32 pos) +{ + return pos % f->nitems; +} + +static int +tcp_test_fifo5 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 400, j = 0, offset = 200; + int i, rv, verbose = 0; + u8 *test_data = 0, *data_buf = 0; + ooo_segment_t *ooo_seg; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + + f = fifo_prepare (fifo_size); + svm_fifo_init_pointers (f, offset); + + vec_validate (test_data, 399); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i % 0xff; + + /* + * Start with [100, 200] and [300, 400] + */ + svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]); + svm_fifo_enqueue_with_offset (f, 300, 100, &test_data[300]); + + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + TCP_TEST ((f->ooos_newest == 1), "newest %u", f->ooos_newest); + if (verbose) + vlib_cli_output (vm, "fifo after [100, 200] and [300, 400] : %U", + format_svm_fifo, f, 2 /* verbose */ ); + + /* + * Add [225, 275] + */ + + rv = svm_fifo_enqueue_with_offset (f, 225, 50, &test_data[200]); + if (verbose) + vlib_cli_output (vm, "fifo after [225, 275] : %U", + format_svm_fifo, f, 2 /* verbose */ ); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 3), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 100 + offset)), + "first seg start %u expected %u", ooo_seg->start, + fifo_pos (f, 100 + offset)); + TCP_TEST ((ooo_seg->length == 100), "first seg length %u expected %u", + ooo_seg->length, 100); + ooo_seg = ooo_segment_next (f, ooo_seg); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 225 + offset)), + "second seg start %u expected %u", + ooo_seg->start, fifo_pos (f, 225 + offset)); + TCP_TEST ((ooo_seg->length == 50), "second seg length %u expected %u", + ooo_seg->length, 50); + ooo_seg = ooo_segment_next (f, ooo_seg); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 
300 + offset)), + "third seg start %u expected %u", + ooo_seg->start, fifo_pos (f, 300 + offset)); + TCP_TEST ((ooo_seg->length == 100), "third seg length %u expected %u", + ooo_seg->length, 100); + TCP_TEST ((f->ooos_newest == 2), "newest %u", f->ooos_newest); + /* + * Add [190, 310] + */ + rv = svm_fifo_enqueue_with_offset (f, 190, 120, &test_data[190]); + if (verbose) + vlib_cli_output (vm, "fifo after [190, 310] : %U", + format_svm_fifo, f, 1 /* verbose */ ); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == fifo_pos (f, offset + 100)), + "first seg start %u expected %u", + ooo_seg->start, fifo_pos (f, offset + 100)); + TCP_TEST ((ooo_seg->length == 300), "first seg length %u expected %u", + ooo_seg->length, 300); + + /* + * Add [0, 150] + */ + rv = svm_fifo_enqueue_nowait (f, 150, test_data); + + if (verbose) + vlib_cli_output (vm, "fifo after [0 150] : %U", format_svm_fifo, f, + 2 /* verbose */ ); + + TCP_TEST ((rv == 400), "managed to enqueue %u expected %u", rv, 400); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + vec_validate (data_buf, 399); + svm_fifo_peek (f, 0, 400, data_buf); + if (compare_data (data_buf, test_data, 0, 400, &j)) + { + TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], + test_data[j]); + } + + /* + * Add [100 200] and overlap it with [50 250] + */ + svm_fifo_free (f); + f = fifo_prepare (fifo_size); + + svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]); + svm_fifo_enqueue_with_offset (f, 50, 200, &test_data[50]); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 50), "first seg start %u expected %u", + ooo_seg->start, 50); + TCP_TEST ((ooo_seg->length == 200), "first seg length %u expected %u", + ooo_seg->length, 200); + + svm_fifo_free (f); + vec_free (test_data); + return 0; +} + +/* *INDENT-OFF* */ +svm_fifo_trace_elem_t fifo_trace[] = {}; +/* *INDENT-ON* */ + +static int +tcp_test_fifo_replay (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t f; + int verbose = 0; + u8 no_read = 0, *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "no-read")) + no_read = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + +#if SVMF_FIFO_TRACE + f.trace = fifo_trace; +#endif + + str = svm_fifo_replay (str, &f, no_read, verbose); + vlib_cli_output (vm, "%v", str); + return 0; +} + +static int +tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) +{ + int res = 0; + char *str; + + /* Run all tests */ + if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT) + { + res = tcp_test_fifo1 (vm, input); + if (res) + return res; + + res = tcp_test_fifo2 (vm); + if (res) + return res; + + /* + * Run a number of fifo3 configs + */ + str = "nsegs 10 overlap seed 123"; + unformat_init_cstring (input, str); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + str = "nsegs 10 overlap seed 123 in-seq-all"; + unformat_init_cstring (input, str); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + str = "nsegs 10 overlap seed 123 
initial-offset 3917"; + unformat_init_cstring (input, str); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + str = "nsegs 10 overlap seed 123 initial-offset 3917 drop"; + unformat_init_cstring (input, str); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + str = "nsegs 10 seed 123 initial-offset 3917 drop no-randomize"; + unformat_init_cstring (input, str); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + res = tcp_test_fifo4 (vm, input); + if (res) + return res; + + res = tcp_test_fifo5 (vm, input); + if (res) + return res; + } + else + { + if (unformat (input, "fifo3")) + { + res = tcp_test_fifo3 (vm, input); + } + else if (unformat (input, "fifo2")) + { + res = tcp_test_fifo2 (vm); + } + else if (unformat (input, "fifo1")) + { + res = tcp_test_fifo1 (vm, input); + } + else if (unformat (input, "fifo4")) + { + res = tcp_test_fifo4 (vm, input); + } + else if (unformat (input, "fifo5")) + { + res = tcp_test_fifo5 (vm, input); + } + else if (unformat (input, "replay")) + { + res = tcp_test_fifo_replay (vm, input); + } + } + + return res; +} + +static int +tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) +{ + session_manager_main_t *smm = &session_manager_main; + tcp_main_t *tm = &tcp_main; + transport_connection_t _tc1, *tc1 = &_tc1, _tc2, *tc2 = &_tc2, *tconn; + tcp_connection_t *tc; + stream_session_t *s; + u8 cmp = 0; + + pool_get (smm->sessions[0], s); + memset (s, 0, sizeof (*s)); + s->session_index = s - smm->sessions[0]; + + pool_get (tm->connections[0], tc); + memset (tc, 0, sizeof (*tc)); + tc->connection.c_index = tc - tm->connections[0]; + tc->connection.s_index = s->session_index; + s->connection_index = tc->connection.c_index; + + tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101); + tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000103); + tc->connection.lcl_port = 35051; + tc->connection.rmt_port = 53764; + tc->connection.transport_proto = 0; + clib_memcpy (tc1, &tc->connection, sizeof (*tc1)); + + pool_get (session_manager_main.sessions[0], s); + memset (s, 0, sizeof (*s)); + s->session_index = s - smm->sessions[0]; + pool_get (tm->connections[0], tc); + memset (tc, 0, sizeof (*tc)); + tc->connection.c_index = tc - tm->connections[0]; + tc->connection.s_index = s->session_index; + s->connection_index = tc->connection.c_index; + + tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101); + tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000102); + tc->connection.lcl_port = 38225; + tc->connection.rmt_port = 53764; + tc->connection.transport_proto = 0; + clib_memcpy (tc2, &tc->connection, sizeof (*tc2)); + + /* + * Confirm that connection lookup works + */ + + stream_session_table_add_for_tc (tc1, tc1->s_index); + tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, + &tc1->rmt_ip.ip4, + tc1->lcl_port, tc1->rmt_port, + tc1->transport_proto, 0); + cmp = (memcmp (&tconn->rmt_ip, &tc1->rmt_ip, sizeof (tc1->rmt_ip)) == 0); + TCP_TEST ((cmp), "rmt ip is identical %d", cmp); + TCP_TEST ((tconn->lcl_port == tc1->lcl_port), + "rmt port is identical %d", tconn->lcl_port == tc1->lcl_port); + + /* + * Non-existing connection lookup should not work + */ + + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->transport_proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + /* + * Delete and lookup again + */ + stream_session_table_del_for_tc 
(tc1); + tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, + &tc1->rmt_ip.ip4, + tc1->lcl_port, tc1->rmt_port, + tc1->transport_proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->transport_proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + /* + * Re-add and lookup tc2 + */ + stream_session_table_add_for_tc (tc1, tc1->s_index); + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->transport_proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + return 0; +} + +static int +tcp_test_session (vlib_main_t * vm, unformat_input_t * input) +{ + int rv = 0; + tcp_connection_t *tc0; + u8 sst = SESSION_TYPE_IP4_TCP; + ip4_address_t local, remote; + u16 local_port, remote_port; + tcp_main_t *tm = vnet_get_tcp_main (); + int is_add = 1; + + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "del")) + is_add = 0; + else if (unformat (input, "add")) + is_add = 1; + else + break; + } + + if (is_add) + { + local.as_u32 = clib_host_to_net_u32 (0x06000101); + remote.as_u32 = clib_host_to_net_u32 (0x06000102); + local_port = clib_host_to_net_u16 (1234); + remote_port = clib_host_to_net_u16 (11234); + + pool_get (tm->connections[0], tc0); + memset (tc0, 0, sizeof (*tc0)); + + tc0->state = TCP_STATE_ESTABLISHED; + tc0->rcv_las = 1; + tc0->c_c_index = tc0 - tm->connections[0]; + tc0->c_lcl_port = local_port; + tc0->c_rmt_port = remote_port; + tc0->c_is_ip4 = 1; + tc0->c_thread_index = 0; + tc0->c_lcl_ip4.as_u32 = local.as_u32; + tc0->c_rmt_ip4.as_u32 = remote.as_u32; + tc0->rcv_opts.mss = 1450; + tcp_connection_init_vars (tc0); + + TCP_EVT_DBG (TCP_EVT_OPEN, tc0); + + if (stream_session_accept (&tc0->connection, 0 /* listener index */ , + sst, 0 /* notify */ )) + clib_warning ("stream_session_accept failed"); + + stream_session_accept_notify (&tc0->connection); + } + else + { + tc0 = tcp_connection_get (0 /* connection index */ , 0 /* thread */ ); + tc0->state = TCP_STATE_CLOSED; + stream_session_disconnect_notify (&tc0->connection); + } + + return rv; +} + +static clib_error_t * +tcp_test (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + int res = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "sack")) + { + res = tcp_test_sack (vm, input); + } + else if (unformat (input, "fifo")) + { + res = tcp_test_fifo (vm, input); + } + else if (unformat (input, "session")) + { + res = tcp_test_session (vm, input); + } + else if (unformat (input, "lookup")) + { + res = tcp_test_lookup (vm, input); + } + else + break; + } + + if (res) + { + return clib_error_return (0, "TCP unit test failed"); + } + else + { + return 0; + } +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_test_command, static) = +{ + .path = "test tcp", + .short_help = "internal tcp unit tests", + .function = tcp_test, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_timer.h b/src/vnet/tcp/tcp_timer.h new file mode 100644 index 00000000..fa25268c --- /dev/null +++ b/src/vnet/tcp/tcp_timer.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_tcp_timer_h__
+#define __included_tcp_timer_h__
+
+#include <vppinfra/tw_timer_16t_2w_512sl.h>
+#include <vppinfra/tw_timer_16t_1w_2048sl.h>
+
+#endif /* __included_tcp_timer_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
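Editor's note (not part of the patch): tcp_timer.h only pulls in two geometries
of vppinfra's timer-wheel template, both with 16 timers per tracked object --
a 2-wheel/512-slot and a 1-wheel/2048-slot instantiation. As a minimal sketch
of how code typically drives such a template wheel; the callback name, object
index, timer id, and tick values below are illustrative assumptions, not taken
from this patch:

#include <vlib/vlib.h>
#include <vppinfra/tw_timer_16t_2w_512sl.h>

static tw_timer_wheel_16t_2w_512sl_t example_wheel;

/* Expiration callback: receives a vector of expired timer handles. For a
 * 16-timer wheel the handle packs the user object (pool) index in the low
 * 28 bits and the timer id in the top 4 bits (assumption based on the
 * template's handle encoding). */
static void
example_expired_timers (u32 * expired_timers)
{
  int i;
  for (i = 0; i < vec_len (expired_timers); i++)
    {
      u32 obj_index = expired_timers[i] & 0x0FFFFFFF;
      u32 timer_id = expired_timers[i] >> 28;
      clib_warning ("timer %u expired on object %u", timer_id, obj_index);
    }
}

static void
example_timer_usage (vlib_main_t * vm)
{
  u32 handle;

  /* 100ms ticks; ~0 means no per-call cap on expirations */
  tw_timer_wheel_init_16t_2w_512sl (&example_wheel, example_expired_timers,
				    100e-3, ~0);

  /* Arm timer id 3 on object 42 for 5 ticks; keep the handle to cancel */
  handle = tw_timer_start_16t_2w_512sl (&example_wheel, 42, 3, 5);
  tw_timer_stop_16t_2w_512sl (&example_wheel, handle);

  /* Call periodically; runs the callback for anything that has expired */
  tw_timer_expire_timers_16t_2w_512sl (&example_wheel, vlib_time_now (vm));
}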