diff options
author | Dave Barach <dbarach@cisco.com> | 2017-06-26 11:35:07 -0400 |
---|---|---|
committer | Florin Coras <florin.coras@gmail.com> | 2017-07-11 15:40:19 +0000 |
commit | 2c25a62cc1cc4937165de740a3b32d78429c72d6 (patch) | |
tree | b860025298501460e21cf8e5722c6155f87495ec /src/vnet/tcp | |
parent | 8af1b2fdecc883eadfec6b91434adc6044e24cb2 (diff) |
Horizontal (nSessions) scaling draft
- Data structure preallocation.
- Input state machine fixes for mid-stream 3-way handshake retries.
- Batch connections in the builtin_client
- Multiple private fifo segment support
- Fix elog simultaneous event type registration
- Fix sacks when segment hole is added after highest sacked
- Add "accepting" session state for sessions pending accept
- Add ssvm non-recursive locking
- Estimate RTT for syn-ack
- Don't init fifo pointers. We're using relative offsets for out-of-order (ooo) segments
- CLI to dump individual session
Change-Id: Ie0598563fd246537bafba4feed7985478ea1d415
Signed-off-by: Dave Barach <dbarach@cisco.com>
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- | src/vnet/tcp/builtin_client.c | 118 | ||||
-rw-r--r-- | src/vnet/tcp/builtin_client.h | 7 | ||||
-rw-r--r-- | src/vnet/tcp/builtin_server.c | 66 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.c | 225 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.h | 13 | ||||
-rwxr-xr-x | src/vnet/tcp/tcp_debug.h | 13 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_input.c | 97 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_newreno.c | 4 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_output.c | 53 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_packet.h | 1 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_test.c | 10 |
11 files changed, 480 insertions, 127 deletions
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 6f8be082d95..a6c8a23582b 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -170,62 +170,90 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { tclient_main_t *tm = &tclient_main; int my_thread_index = vlib_get_thread_index (); - vl_api_disconnect_session_t *dmp; session_t *sp; int i; int delete_session; u32 *connection_indices; - u32 tx_quota = 0; - u32 delta, prev_bytes_received_this_session; + u32 *connections_this_batch; + u32 nconnections_this_batch; connection_indices = tm->connection_index_by_thread[my_thread_index]; + connections_this_batch = + tm->connections_this_batch_by_thread[my_thread_index]; - if (tm->run_test == 0 || vec_len (connection_indices) == 0) + if ((tm->run_test == 0) || + ((vec_len (connection_indices) == 0) + && vec_len (connections_this_batch) == 0)) return 0; - for (i = 0; i < vec_len (connection_indices); i++) + /* Grab another pile of connections */ + if (PREDICT_FALSE (vec_len (connections_this_batch) == 0)) + { + nconnections_this_batch = + clib_min (tm->connections_per_batch, vec_len (connection_indices)); + + ASSERT (nconnections_this_batch > 0); + vec_validate (connections_this_batch, nconnections_this_batch - 1); + clib_memcpy (connections_this_batch, + connection_indices + vec_len (connection_indices) + - nconnections_this_batch, + nconnections_this_batch * sizeof (u32)); + _vec_len (connection_indices) -= nconnections_this_batch; + } + + if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch + && tm->prev_conns == vec_len (connections_this_batch))) + { + tm->repeats++; + tm->prev_conns = vec_len (connections_this_batch); + if (tm->repeats == 500000) + { + clib_warning ("stuck clients"); + } + } + else + { + tm->prev_conns = vec_len (connections_this_batch); + tm->repeats = 0; + } + + for (i = 0; i < vec_len (connections_this_batch); i++) { delete_session = 1; - sp = pool_elt_at_index 
(tm->sessions, connection_indices[i]); + sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]); - if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) + if (sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; - tx_quota++; } - if (!tm->no_return && sp->bytes_to_receive > 0) + if (sp->bytes_to_receive > 0) { - prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); - delta = sp->bytes_received - prev_bytes_received_this_session; - if (delta > 0) - tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) { - __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); + u32 index, thread_index; + stream_session_t *s; + + __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); - dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = tm->my_client_index; - dmp->handle = sp->vpp_session_handle; - if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, - 1)) + stream_session_parse_handle (sp->vpp_session_handle, + &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + + if (s) { - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; + stream_session_disconnect (s); + vec_delete (connections_this_batch, 1, i); + i--; __sync_fetch_and_add (&tm->ready_connections, -1); } else - { - vl_msg_api_free (dmp); - } + clib_warning ("session AWOL?"); /* Kick the debug CLI process */ if (tm->ready_connections == 0) @@ -236,6 +264,10 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } } } + + tm->connection_index_by_thread[my_thread_index] = connection_indices; + tm->connections_this_batch_by_thread[my_thread_index] = + connections_this_batch; return 0; } @@ -356,6 +388,8 @@ tcp_test_clients_init (vlib_main_t * vm) 
tm->vlib_main = vm; vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains); + vec_validate (tm->connections_this_batch_by_thread, + thread_main->n_vlib_mains); return 0; } @@ -388,7 +422,8 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, pool_get (tm->sessions, session); memset (session, 0, sizeof (*session)); session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send; session->server_rx_fifo = s->server_rx_fifo; session->server_rx_fifo->client_session_index = session_index; session->server_tx_fifo = s->server_tx_fifo; @@ -485,6 +520,8 @@ attach_builtin_test_clients_app (void) options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count; + options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size; options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; @@ -561,6 +598,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->bytes_to_send = 8192; tm->no_return = 0; tm->fifo_size = 64 << 10; + tm->connections_per_batch = 1000; + tm->private_segment_count = 0; + tm->private_segment_size = 0; vec_free (tm->connect_uri); @@ -582,6 +622,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->no_return = 1; else if (unformat (input, "fifo-size %d", &tm->fifo_size)) tm->fifo_size <<= 10; + else if (unformat (input, "private-segment-count %d", + &tm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + tm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + tm->private_segment_size = tmp << 30; + else if (unformat (input, 
"private-segment-size %d", &tmp)) + tm->private_segment_size = tmp; + else if (unformat (input, "preallocate-fifos")) + tm->prealloc_fifos = 1; + else + if (unformat (input, "client-batch %d", &tm->connections_per_batch)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); @@ -688,9 +742,13 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "zero delta-t?"); cleanup: - pool_free (tm->sessions); + tm->run_test = 0; for (i = 0; i < vec_len (tm->connection_index_by_thread); i++) - vec_reset_length (tm->connection_index_by_thread[i]); + { + vec_reset_length (tm->connection_index_by_thread[i]); + vec_reset_length (tm->connections_this_batch_by_thread[i]); + } + pool_free (tm->sessions); return 0; } diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 3462e0eeee8..38af231dec3 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -63,6 +63,9 @@ typedef struct u32 configured_segment_size; u32 fifo_size; u32 expected_connections; /**< Number of clients/connections */ + u32 connections_per_batch; /**< Connections to rx/tx at once */ + u32 private_segment_count; /**< Number of private fifo segs */ + u32 private_segment_size; /**< size of private fifo segs */ /* * Test state variables @@ -72,6 +75,7 @@ typedef struct uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + u32 **connections_this_batch_by_thread; /**< active connection batch */ pthread_t client_thread_handle; volatile u32 ready_connections; @@ -82,7 +86,8 @@ typedef struct f64 test_start_time; f64 test_end_time; - + u32 prev_conns; + u32 repeats; /* * Flags */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 775bfc26e47..8e958ac0b45 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -56,12 +56,15 @@ typedef struct u32 fifo_size; 
/**< Fifo size */ u32 rcv_buffer_size; /**< Rcv buffer size */ u32 prealloc_fifos; /**< Preallocate fifos */ + u32 private_segment_count; /**< Number of private segments */ + u32 private_segment_size; /**< Size of private segments */ /* * Test state */ u8 **rx_buf; /**< Per-thread RX buffer */ u64 byte_index; + u32 **rx_retries; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -77,6 +80,8 @@ builtin_session_accept_callback (stream_session_t * s) session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; bsm->byte_index = 0; + vec_validate (bsm->rx_retries[s->thread_index], s->session_index); + bsm->rx_retries[s->thread_index][s->session_index] = 0; return 0; } @@ -173,11 +178,16 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - u32 my_thread_id = vlib_get_thread_index (); + u32 thread_index = vlib_get_thread_index (); + + ASSERT (s->thread_index == thread_index); rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; + ASSERT (rx_fifo->master_thread_index == thread_index); + ASSERT (tx_fifo->master_thread_index == thread_index); + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -201,21 +211,31 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - q = bsm->vpp_queue[s->thread_index]; + q = bsm->vpp_queue[thread_index]; if (PREDICT_FALSE (q->cursize == q->maxsize)) clib_warning ("out of event queue space"); - else - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* don't wait for mutex */ ); + else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */ + )) + clib_warning ("failed to enqueue self-tap"); + + bsm->rx_retries[thread_index][s->session_index]++; + if (bsm->rx_retries[thread_index][s->session_index] == 500000) + { + clib_warning ("session stuck: %U", 
format_stream_session, s, 2); + } + } + else + { + bsm->rx_retries[thread_index][s->session_index] = 0; } return 0; } - _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; + _vec_len (bsm->rx_buf[thread_index]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -225,7 +245,7 @@ builtin_server_rx_callback (stream_session_t * s) */ n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); if (n_written != max_transfer) clib_warning ("short trout!"); @@ -237,11 +257,13 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("failed to enqueue tx evt"); } - if (PREDICT_FALSE (max_enqueue < max_dequeue)) + if (PREDICT_FALSE (n_written < max_dequeue)) goto rx_event; return 0; @@ -328,9 +350,13 @@ server_attach () a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; - a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size; a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = bsm->prealloc_fifos ? 
bsm->prealloc_fifos : 1; + + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -374,6 +400,8 @@ server_create (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); vec_validate (bsm->rx_buf, num_threads - 1); + vec_validate (bsm->rx_retries, num_threads - 1); + for (i = 0; i < num_threads; i++) vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); @@ -435,11 +463,14 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, { builtin_server_main_t *bsm = &builtin_server_main; int rv; + u32 tmp; bsm->no_echo = 0; bsm->fifo_size = 64 << 10; bsm->rcv_buffer_size = 128 << 10; bsm->prealloc_fifos = 0; + bsm->private_segment_count = 0; + bsm->private_segment_size = 0; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -449,8 +480,17 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, bsm->fifo_size <<= 10; else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) ; - else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) + else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bsm->private_segment_count)) ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + bsm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + bsm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + bsm->private_segment_size = tmp; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 4e85eb3fc93..f379e699839 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -74,8 +74,16 @@ static void tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); - 
TCP_EVT_DBG (TCP_EVT_UNBIND, - pool_elt_at_index (tm->listener_pool, listener_index)); + tcp_connection_t *tc; + + tc = pool_elt_at_index (tm->listener_pool, listener_index); + + TCP_EVT_DBG (TCP_EVT_UNBIND, tc); + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put_index (tm->listener_pool, listener_index); } @@ -124,9 +132,20 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Check if half-open */ if (tc->state == TCP_STATE_SYN_SENT) - pool_put (tm->half_open_connections, tc); + { + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->half_open_connections, tc); + } else - pool_put (tm->connections[tc->c_thread_index], tc); + { + int thread_index = tc->c_thread_index; + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->connections[thread_index], tc); + } } /** @@ -168,13 +187,14 @@ tcp_connection_reset (tcp_connection_t * tc) /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); - stream_session_reset_notify (&tc->connection); + + /* Wait for cleanup from session layer but not forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: return; } - } /** @@ -278,6 +298,9 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) tries = max - min; time_now = tcp_time_now (); + /* Only support active opens from thread 0 */ + ASSERT (vlib_get_thread_index () == 0); + /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); @@ -343,6 +366,7 @@ tcp_connection_timers_reset (tcp_connection_t * tc) } } +#if 0 typedef struct ip4_tcp_hdr { ip4_header_t ip; @@ -435,6 +459,7 @@ tcp_connection_fib_attach (tcp_connection_t * tc) tcp_connection_stack_on_fib_entry (tc); } +#endif /* 0 */ /** Initialize tcp connection variables * @@ -447,7 +472,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); 
scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); - tcp_connection_fib_attach (tc); + // tcp_connection_fib_attach (tc); } int @@ -485,14 +510,38 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) if (is_ip4) { ip4_address_t *ip4; - ip4 = ip_interface_get_first_ip (sw_if_index, 1); - lcl_addr.ip4.as_u32 = ip4->as_u32; + int index; + if (vec_len (tm->ip4_src_addresses)) + { + index = tm->last_v4_address_rotor++; + if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses)) + tm->last_v4_address_rotor = 0; + lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32; + } + else + { + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } } else { ip6_address_t *ip6; - ip6 = ip_interface_get_first_ip (sw_if_index, 0); - clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + int index; + + if (vec_len (tm->ip6_src_addresses)) + { + index = tm->last_v6_address_rotor++; + if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses)) + tm->last_v6_address_rotor = 0; + clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index], + sizeof (*ip6)); + } + else + { + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } } /* Allocate source port */ @@ -614,7 +663,7 @@ u8 * format_tcp_vars (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n", + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, tc->snd_una_max - tc->iss); s = format (s, " rcv_nxt %u rcv_las %u\n", @@ -628,12 +677,17 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + s = format (s, " prev_ssthresh %u 
snd_congestion %u dupack %u", tc->prev_ssthresh, tc->snd_congestion - tc->iss, tc->rcv_dupacks); + s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss); + s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr, + tc->tsecr_last_ack); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); + s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + tcp_time_now () - tc->tsval_recent_age); s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -719,11 +773,21 @@ format_tcp_sacks (u8 * s, va_list * args) tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); sack_block_t *sacks = tc->snd_sacks; sack_block_t *block; - vec_foreach (block, sacks) - { - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } return s; } @@ -796,14 +860,18 @@ tcp_session_send_mss (transport_connection_t * trans_conn) always_inline u32 tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) { - if (tc->snd_wnd < tc->snd_mss) + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) { return tc->snd_wnd <= snd_space ? 
tc->snd_wnd : 0; } /* If we can't write at least a segment, don't try at all */ - if (snd_space < tc->snd_mss) - return 0; + if (PREDICT_FALSE (snd_space < tc->snd_mss)) + { + if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) + return snd_space; + return 0; + } /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1042,6 +1110,8 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; + int thread, i; + tcp_connection_t *tc __attribute__ ((unused)); if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1074,6 +1144,27 @@ tcp_main_enable (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (tm->connections, num_threads - 1); + /* + * Preallocate connections + */ + for (thread = 0; thread < num_threads; thread++) + { + for (i = 0; i < tm->preallocated_connections; i++) + pool_get (tm->connections[thread], tc); + + for (i = 0; i < tm->preallocated_connections; i++) + pool_put_index (tm->connections[thread], i); + } + + /* + * Preallocate half-open connections + */ + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_get (tm->half_open_connections, tc); + + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_put_index (tm->half_open_connections, i); + /* Initialize per worker thread tx buffers (used for control messages) */ vec_validate (tm->tx_buffers, num_threads - 1); @@ -1116,7 +1207,6 @@ tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - tm->vlib_main = vm; tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; @@ -1125,6 +1215,97 @@ tcp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (tcp_init); + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "preallocated-connections 
%d", + &tm->preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tm->preallocated_half_open_connections)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + +static clib_error_t * +tcp_src_address (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + int v4set = 0; + int v6set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip4_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v4start)); + v6set = 1; + } + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at least one v4 or v6 address required"); + + if (v4set) + { + u32 tmp; + + do + { + vec_add1 (tm->ip4_src_addresses, v4start); + tmp = clib_net_to_host_u32 (v4start.as_u32); + tmp++; + v4start.as_u32 = clib_host_to_net_u32 (tmp); + } + while (clib_host_to_net_u32 (v4start.as_u32) <= + clib_host_to_net_u32 (v4end.as_u32)); + } + if (v6set) + { + clib_warning ("v6 src address list unimplemented..."); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range", + .function = tcp_src_address, +}; +/* *INDENT-ON* */ + + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h 
b/src/vnet/tcp/tcp.h index 12d804b82f6..37b10fd4753 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -348,6 +348,16 @@ typedef struct _tcp_main /* Flag that indicates if stack is on or off */ u8 is_enabled; + /** Number of preallocated connections */ + u32 preallocated_connections; + u32 preallocated_half_open_connections; + + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ + ip4_address_t *ip4_src_addresses; + u32 last_v4_address_rotor; + u32 last_v6_address_rotor; + ip6_address_t *ip6_src_addresses; + /* convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -569,6 +579,7 @@ tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -577,6 +588,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) return; @@ -588,6 +600,7 @@ tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) always_inline void tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->timers[timer_id]); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ae68ad1b264..be51bca2a26 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -383,9 +383,16 @@ typedef enum _tcp_dbg_evt "establish", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = _timer_id; \ - ed->data[1] = _timer_id; \ + if (_tc) \ + { \ + DECLARE_ETD(_tc, _e, 
2); \ + ed->data[0] = _timer_id; \ + ed->data[1] = _timer_id; \ + } \ + else \ + { \ + clib_warning ("pop for unexisting connection %d", _tc_index); \ + } \ } #define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a2e6dad1298..45db0da69c6 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -251,6 +251,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { + ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } @@ -383,12 +384,9 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) if (tc->srtt != 0) { err = mrtt - tc->srtt; -// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ -// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; - tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); diff = (clib_abs (err) - (int) tc->rttvar) >> 2; tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); @@ -491,6 +489,14 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } +static u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + /** * Checks if ack is a congestion control event. 
*/ @@ -503,7 +509,7 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, *is_dack = tc->sack_sb.last_sacked_bytes || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); - return (*is_dack || tcp_in_cong_recovery (tc)); + return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); } void @@ -750,10 +756,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) * last hole end */ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->high_sacked) - && seq_gt (tc->snd_una_max, last_hole->end)) - last_hole->end = tc->snd_una_max; - /* keep track of max byte sacked for when the last hole + if (seq_gt (tc->snd_una_max, last_hole->end)) + { + if (seq_geq (last_hole->start, sb->high_sacked)) + { + last_hole->end = tc->snd_una_max; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_una_max)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_una_max); + } + } + /* Keep track of max byte sacked for when the last hole * is acked */ if (seq_gt (tmp.end, sb->high_sacked)) sb->high_sacked = tmp.end; @@ -764,7 +780,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { blk = &tc->rcv_opts.sacks[blk_index]; - if (seq_leq (blk->start, hole->start)) { /* Block covers hole. 
Remove hole */ @@ -784,6 +799,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } else if (!next_hole) { + ASSERT (seq_geq (sb->high_sacked, ack)); sb->snd_una_adv = sb->high_sacked - ack; sb->last_bytes_delivered += sb->high_sacked - hole->end; } @@ -819,7 +835,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { hole->end = blk->start; } - hole = scoreboard_next_hole (sb, hole); } } @@ -827,10 +842,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->holes[sb->head].start == ack + sb->snd_una_adv); } /** @@ -916,7 +934,8 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tc->snd_rxt_ts + return (tcp_in_recovery (tc) + && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } @@ -994,6 +1013,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { ASSERT (tc->snd_una != tc->snd_una_max || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) @@ -1012,17 +1032,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) goto partial_ack_test; } - /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold, - * and the following doesn't hold: the congestion window is - * greater than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes (XXX) - * 2) RFC6582 heuristic to avoid multiple fast retransmits + /* If of 
of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp */ - if ((seq_gt (tc->snd_una, tc->snd_congestion) - || !(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) { tc->rcv_dupacks = 0; return; @@ -1038,6 +1061,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) * three segments that have left the network and should've been * buffered at the receiver XXX */ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; + ASSERT (tc->cwnd >= tc->snd_mss); /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts) @@ -1112,7 +1136,7 @@ partial_ack: >= tc->sack_sb.last_bytes_delivered); rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - tc->sack_sb.last_bytes_delivered; - if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ @@ -1301,6 +1325,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, { int written; + ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Update rcv_nxt and be done. 
*/ if (PREDICT_FALSE (data_len == 0)) { @@ -1450,6 +1476,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; + vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; vlib_buffer_advance (b, n_bytes_to_drop); goto in_order; @@ -1912,11 +1939,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); /* Make sure after data segment processing ACK is sent */ new_tc0->flags |= TCP_CONN_SNDACK; + + /* Update rtt with the syn-ack sample */ + new_tc0->bytes_acked = 1; + tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); } /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ else @@ -1932,9 +1960,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); + tc0->rtt_ts = 0; + tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2151,8 +2178,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; - - /* Shoulder tap the server */ stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ @@ -2175,6 +2200,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ if (tc0->snd_una == tc0->snd_una_max) { + ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); @@ -2545,10 +2571,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); - /* Init fifo pointers 
after we have iss */ - stream_session_init_fifos_pointers (&child0->connection, - child0->irs + 1, - child0->iss + 1); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -2886,9 +2908,12 @@ do { \ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. */ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -2905,12 +2930,14 @@ do { \ _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN in reply to our FIN from the other side */ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN confirming that the peer (app) has closed */ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -2929,6 +2956,8 @@ do { \ TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + 
TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c825e952c9b..103fea4c194 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -63,8 +63,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) * window deflation" attempts to ensure that, when fast recovery * eventually ends, approximately ssthresh amount of data will be * outstanding in the network.*/ - tc->cwnd = (tc->cwnd > tc->bytes_acked) ? - tc->cwnd - tc->bytes_acked : 0; + tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ? + tc->cwnd - tc->bytes_acked : tc->snd_mss; if (tc->bytes_acked > tc->snd_mss) tc->cwnd += tc->snd_mss; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 41bebcb34af..b418e8baa54 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,17 +19,20 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, TCP_OUTPUT_N_NEXT } tcp_output_next_t; #define foreach_tcp4_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") #define foreach_tcp6_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") static char *tcp_error_strings[] = { #define tcp_error(n,s) s, @@ -427,16 +430,16 @@ tcp_init_mss (tcp_connection_t * tc) #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + 
vlib_get_main(), my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -445,12 +448,12 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ } while (0) always_inline void @@ -757,23 +760,22 @@ void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -851,6 +853,13 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? 
tcp4_output_node.index : tcp6_output_node.index; + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + /* Enqueue the packet */ f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); @@ -1144,6 +1153,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; } else { @@ -1232,7 +1242,7 @@ tcp_timer_persist_handler (u32 index) /* Nothing to send */ if (n_bytes <= 0) { - clib_warning ("persist found nothing to send"); + // clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } @@ -1448,7 +1458,7 @@ tcp46_output_inline (vlib_main_t * vm, tcp_connection_t *tc0; tcp_tx_trace_t *t0; tcp_header_t *th0 = 0; - u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; to_next[0] = bi0; @@ -1527,6 +1537,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } +#if 0 /* Make sure we haven't lost route to our peer */ if (PREDICT_FALSE (tc0->last_fib_check < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) @@ -1547,6 +1558,10 @@ tcp46_output_inline (vlib_main_t * vm, /* Use pre-computed dpo to set next node */ next0 = tc0->c_rmt_dpo.dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; done: diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index a6f62ee16d5..9ccfe6553ff 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -168,6 +168,7 @@ typedef struct #define TCP_OPTION_LEN_TIMESTAMP 10 #define TCP_OPTION_LEN_SACK_BLOCK 8 +#define TCP_HDR_LEN_MAX 60 #define TCP_WND_MAX 65535U #define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ #define TCP_OPTS_ALIGN 4 
diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index a461e3b8b7d..510deb4fec0 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -290,7 +290,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) { tcp_connection_t _tc, *tc = &_tc; sack_block_t *sacks; - int i, verbose = 0; + int i, verbose = 0, expected; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -326,8 +326,12 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) sacks = vec_dup (tc->snd_sacks); tcp_update_sack_list (tc, 1100, 1200); - TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", - vec_len (tc->snd_sacks), 5); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5; + TCP_TEST ((vec_len (tc->snd_sacks) == expected), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected); TCP_TEST ((tc->snd_sacks[0].start == 1100), "first sack block start %u expected %u", tc->snd_sacks[0].start, 1100); |