From e04c29942af6a130591059679531c9ffa3d7237a Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 1 Mar 2017 08:17:34 -0800 Subject: Cleanup URI code and TCP bugfixing - Add CLI/API to enable session layer, by default it's disabled - Improve rcv wnd computation - Improvements to tx path - URI code cleanup - Builtin test tcp server - Improve src port allocation Change-Id: I2ace498e76a0771d4c31a8075cc14fe33d7dfa38 Signed-off-by: Florin Coras --- src/vnet/tcp/builtin_server.c | 135 ++++++++++++++++++++++++++++++++++++++++++ src/vnet/tcp/tcp.c | 48 +++++++-------- src/vnet/tcp/tcp.h | 4 +- src/vnet/tcp/tcp_input.c | 56 +++++++++++------- src/vnet/tcp/tcp_output.c | 90 +++++++++++++++++++--------- 5 files changed, 257 insertions(+), 76 deletions(-) create mode 100644 src/vnet/tcp/builtin_server.c (limited to 'src/vnet/tcp') diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c new file mode 100644 index 00000000000..be65642ae3b --- /dev/null +++ b/src/vnet/tcp/builtin_server.c @@ -0,0 +1,135 @@ +/* +* Copyright (c) 2015-2017 Cisco and/or its affiliates. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at: +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include +#include + +int +builtin_session_accept_callback (stream_session_t * s) +{ + clib_warning ("called..."); + s->session_state = SESSION_STATE_READY; + return 0; +} + +void +builtin_session_disconnect_callback (stream_session_t * s) +{ + clib_warning ("called..."); +} + +int +builtin_session_connected_callback (u32 client_index, + stream_session_t * s, u8 is_fail) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_add_segment_callback (u32 client_index, + const u8 * seg_name, u32 seg_size) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_redirect_connect_callback (u32 client_index, void *mp) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_server_rx_callback (stream_session_t * s) +{ + clib_warning ("called..."); + return 0; +} + +static session_cb_vft_t builtin_session_cb_vft = { + .session_accept_callback = builtin_session_accept_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .session_connected_callback = builtin_session_connected_callback, + .add_segment_callback = builtin_add_segment_callback, + .redirect_connect_callback = builtin_redirect_connect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; + +static int +server_create (vlib_main_t * vm) +{ + vnet_bind_args_t _a, *a = &_a; + u64 options[SESSION_OPTIONS_N_OPTIONS]; + char segment_name[128]; + + memset (a, 0, sizeof (*a)); + memset (options, 0, sizeof (options)); + + a->uri = "tcp://0.0.0.0/80"; + a->api_client_index = ~0; + a->session_cb_vft = &builtin_session_cb_vft; + a->options = options; + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->segment_name = segment_name; + a->segment_name_length = ARRAY_LEN (segment_name); + + return vnet_bind_uri (a); +} + +static clib_error_t * +server_create_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int rv; +#if 0 + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "whatever %d", &whatever)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } +#endif + + rv = server_create (vm); + switch (rv) + { + case 0: + break; + default: + return clib_error_return (0, "server_create returned %d", rv); + } + return 0; +} + +VLIB_CLI_COMMAND (server_create_command, static) = +{ +.path = "test server",.short_help = "test server",.function = + server_create_command_fn,}; + +/* +* fd.io coding-style-patch-verification: ON +* +* Local Variables: +* eval: (c-set-style "gnu") +* End: +*/ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 0f9b7097b42..e5feaeb1e03 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -217,6 +217,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) return 0; } +#define PORT_MASK ((1 << 16)- 1) /** * Allocate local port and add if successful add entry to local endpoint * table to mark the pair as used. @@ -224,7 +225,6 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) u16 tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) { - u8 unique = 0; transport_endpoint_t *tep; u32 time_now, tei; u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ @@ -235,37 +235,34 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); - tep->port = random_u32 (&time_now) << 16; - tep->port = tep->port < min ? max : tep->port; /* Search for first free slot */ - while (tries) + for (; tries >= 0; tries--) { + u16 port = 0; + + /* Find a port in the specified range */ + while (1) + { + port = random_u32 (&time_now) & PORT_MASK; + if (PREDICT_TRUE (port >= min && port < max)) + break; + } + + tep->port = port; + + /* Look it up */ tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, tep->port); + /* If not found, we're done */ if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) { - unique = 1; - break; + transport_endpoint_table_add (&tm->local_endpoints_table, tep, + tep - tm->local_endpoints); + return tep->port; } - - tep->port--; - - if (tep->port < min) - tep->port = max; - - tries--; } - - if (unique) - { - transport_endpoint_table_add (&tm->local_endpoints_table, tep, - tep - tm->local_endpoints); - - return tep->port; - } - - /* Failed */ + /* No free ports */ pool_put (tm->local_endpoints, tep); return -1; } @@ -360,7 +357,10 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) /* Allocate source port */ lcl_port = tcp_allocate_local_port (tm, &lcl_addr); if (lcl_port < 1) - return -1; + { + clib_warning ("Failed to allocate src port"); + return -1; + } /* * Create connection and send SYN diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 22f00a63273..3560509d090 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -30,7 +30,8 @@ #define TCP_MAX_OPTION_SPACE 40 #define TCP_DUPACK_THRESHOLD 3 -#define TCP_DEFAULT_RX_FIFO_SIZE 64 << 10 +#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_IW_N_SEGMENTS 10 /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -590,7 +591,6 @@ vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, /** * Push TCP header to buffer * - * @param vm - vlib_main * @param b - buffer to write the header to * @param sp_net - source port net order * @param dp_net - destination port net order diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index daa0683b48e..0a907d0a3d7 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -711,7 +711,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_opts_sack_permitted (&tc->opt)) tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale; + new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) { @@ -1320,7 +1320,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Parse options */ tcp_options_parse (tcp0, &new_tc0->opt); - tcp_connection_init_vars (new_tc0); if (tcp_opts_tstamp (&new_tc0->opt)) { @@ -1331,11 +1330,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->opt)) new_tc0->snd_wscale = new_tc0->opt.wscale; - new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) - << new_tc0->snd_wscale; + /* No scaling */ + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; + tcp_connection_init_vars (new_tc0); + /* SYN-ACK: See if we can switch to ESTABLISHED state */ if (tcp_ack (tcp0)) { @@ -1345,6 +1346,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->snd_una = ack0; new_tc0->state = TCP_STATE_ESTABLISHED; + /* Make sure las is initialized for the wnd computation */ + new_tc0->rcv_las = new_tc0->rcv_nxt; + /* Notify app that we have connection */ stream_session_connect_notify (&new_tc0->connection, sst, 0); @@ -1575,7 +1579,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Initialize session variables */ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; - tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) << tc0->opt.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -1899,7 +1903,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tcp_options_parse (th0, &child0->opt); - tcp_connection_init_vars (child0); child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; @@ -1913,6 +1916,16 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->tsval_recent_age = tcp_time_now (); } + if (tcp_opts_wscale (&child0->opt)) + child0->snd_wscale = child0->opt.wscale; + + /* No scaling */ + child0->snd_wnd = clib_net_to_host_u16 (th0->window); + child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + tcp_connection_init_vars (child0); + /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); @@ -1923,7 +1936,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } - b0->error = error0 ? node->errors[error0] : 0; + b0->error = node->errors[error0]; vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -2069,7 +2082,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->cpu_index; tcp_main_t *tm = vnet_get_tcp_main (); - session_manager_main_t *ssm = vnet_get_session_manager_main (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -2109,26 +2121,26 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* lookup session */ tc0 = - (tcp_connection_t *) stream_session_lookup_transport4 (ssm, - &ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); + (tcp_connection_t *) + stream_session_lookup_transport4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); } else { ip60 = vlib_buffer_get_current (b0); tcp0 = ip6_next_header (ip60); tc0 = - (tcp_connection_t *) stream_session_lookup_transport6 (ssm, - &ip60->src_address, - &ip60->dst_address, - tcp0->src_port, - tcp0->dst_port, - SESSION_TYPE_IP6_TCP, - my_thread_index); + (tcp_connection_t *) + stream_session_lookup_transport6 (&ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); } /* Session exists */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index dbcf1f74975..7e431cd0454 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -90,6 +90,15 @@ tcp_window_compute_scale (u32 available_space) return wnd_scale; } +/** + * TCP's IW as recommended by RFC6928 + */ +always_inline u32 +tcp_initial_wnd_unscaled (tcp_connection_t * tc) +{ + return TCP_IW_N_SEGMENTS * dummy_mtu; +} + /** * Compute initial window and scale factor. As per RFC1323, window field in * SYN and SYN-ACK segments is never scaled. @@ -97,18 +106,15 @@ tcp_window_compute_scale (u32 available_space) u32 tcp_initial_window_to_advertise (tcp_connection_t * tc) { - u32 available_space; + u32 max_fifo; /* Initial wnd for SYN. Fifos are not allocated yet. - * Use some predefined value */ - if (tc->state != TCP_STATE_SYN_RCVD) - { - return TCP_DEFAULT_RX_FIFO_SIZE; - } + * Use some predefined value. For SYN-ACK we still want the + * scale to be computed in the same way */ + max_fifo = TCP_MAX_RX_FIFO_SIZE; - available_space = stream_session_max_enqueue (&tc->connection); - tc->rcv_wscale = tcp_window_compute_scale (available_space); - tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + tc->rcv_wscale = tcp_window_compute_scale (max_fifo); + tc->rcv_wnd = tcp_initial_wnd_unscaled (tc); return clib_min (tc->rcv_wnd, TCP_WND_MAX); } @@ -119,23 +125,43 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) u32 tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) { - u32 available_space, wnd, scaled_space; + u32 available_space, max_fifo, observed_wnd; - if (state != TCP_STATE_ESTABLISHED) + if (state < TCP_STATE_ESTABLISHED) return tcp_initial_window_to_advertise (tc); + /* + * Figure out how much space we have available + */ available_space = stream_session_max_enqueue (&tc->connection); - scaled_space = available_space >> tc->rcv_wscale; + max_fifo = stream_session_fifo_size (&tc->connection); + + ASSERT (tc->opt.mss < max_fifo); + + if (available_space < tc->opt.mss && available_space < max_fifo / 8) + available_space = 0; - /* Need to update scale */ - if (PREDICT_FALSE ((scaled_space == 0 && available_space != 0)) - || (scaled_space >= TCP_WND_MAX)) - tc->rcv_wscale = tcp_window_compute_scale (available_space); + /* + * Use the above and what we know about what we've previously advertised + * to compute the new window + */ + observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); - wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); - tc->rcv_wnd = wnd; + /* Bad. Thou shalt not shrink */ + if (available_space < observed_wnd) + { + if (available_space == 0) + clib_warning ("Didn't shrink rcv window despite not having space"); + } + + tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } - return wnd >> tc->rcv_wscale; + return tc->rcv_wnd >> tc->rcv_wscale; } /** @@ -225,7 +251,7 @@ tcp_options_write (u8 * data, tcp_options_t * opts) } always_inline int -tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) +tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) { u8 len = 0; @@ -234,7 +260,7 @@ tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) len += TCP_OPTION_LEN_MSS; opts->flags |= TCP_OPTS_FLAG_WSCALE; - opts->wscale = tcp_window_compute_scale (initial_wnd); + opts->wscale = wnd_scale; len += TCP_OPTION_LEN_WINDOW_SCALE; opts->flags |= TCP_OPTS_FLAG_TSTAMP; @@ -327,8 +353,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, case TCP_STATE_SYN_RCVD: return tcp_make_synack_options (tc, opts); case TCP_STATE_SYN_SENT: - return tcp_make_syn_options (opts, - tcp_initial_window_to_advertise (tc)); + return tcp_make_syn_options (opts, tc->rcv_wscale); default: clib_warning ("Not handled!"); return 0; @@ -732,7 +757,7 @@ tcp_send_syn (tcp_connection_t * tc) /* Make and write options */ memset (&snd_opts, 0, sizeof (snd_opts)); - tcp_opts_len = tcp_make_syn_options (&snd_opts, initial_wnd); + tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, @@ -900,7 +925,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, tcp_reuse_buffer (vm, b); - ASSERT (tc->state == TCP_STATE_ESTABLISHED); + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); ASSERT (max_bytes != 0); if (tcp_opts_sack_permitted (&tc->opt)) @@ -929,7 +954,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, max_bytes); ASSERT (n_bytes != 0); - tc->snd_nxt += n_bytes; tcp_push_hdr_i (tc, b, tc->state); return n_bytes; @@ -967,7 +991,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (tc->state == TCP_STATE_ESTABLISHED) + if (tc->state >= TCP_STATE_ESTABLISHED) { tcp_fastrecovery_off (tc); @@ -977,6 +1001,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Figure out what and how many bytes we can send */ snd_space = tcp_available_snd_space (tc); max_bytes = clib_min (tc->snd_mss, snd_space); + + if (max_bytes == 0) + { + clib_warning ("no wnd to retransmit"); + return; + } tcp_prepare_retransmit_segment (tc, b, max_bytes); tc->rtx_bytes += max_bytes; @@ -996,7 +1026,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_push_hdr_i (tc, b, tc->state); + + /* Account for the SYN */ + tc->snd_nxt += 1; } if (!is_syn) @@ -1163,8 +1197,8 @@ tcp46_output_inline (vlib_main_t * vm, if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) { + ASSERT (tc0->snt_dupacks > 0); tc0->snt_dupacks--; - ASSERT (tc0->snt_dupacks >= 0); if (!tcp_session_has_ooo_data (tc0)) { error0 = TCP_ERROR_FILTERED_DUPACKS; -- cgit 1.2.3-korg