summary | refs | log | tree | commit | diff | stats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
author Florin Coras <fcoras@cisco.com> 2017-03-01 08:17:34 -0800
committer Dave Barach <openvpp@barachs.net> 2017-03-04 01:22:36 +0000
commit e04c29942af6a130591059679531c9ffa3d7237a (patch)
tree 3dd68c33cb346820d098390a088d733e02e779e4 /src/vnet/tcp
parent fb38095d1c9d1b84850f345f0344f82b9ae2c375 (diff)
Cleanup URI code and TCP bugfixing
- Add CLI/API to enable session layer, by default it's disabled
- Improve rcv wnd computation
- Improvements to tx path
- URI code cleanup
- Builtin test tcp server
- Improve src port allocation

Change-Id: I2ace498e76a0771d4c31a8075cc14fe33d7dfa38
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- src/vnet/tcp/builtin_server.c | 135
-rw-r--r-- src/vnet/tcp/tcp.c | 48
-rw-r--r-- src/vnet/tcp/tcp.h | 4
-rw-r--r-- src/vnet/tcp/tcp_input.c | 56
-rw-r--r-- src/vnet/tcp/tcp_output.c | 90
5 files changed, 257 insertions, 76 deletions
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
new file mode 100644
index 00000000000..be65642ae3b
--- /dev/null
+++ b/src/vnet/tcp/builtin_server.c
@@ -0,0 +1,135 @@
+/*
+* Copyright (c) 2015-2017 Cisco and/or its affiliates.
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <vnet/vnet.h>
+#include <vlibmemory/api.h>
+#include <vnet/session/application.h>
+#include <vnet/session/application_interface.h>
+
+/**
+ * Accept callback of the builtin test server.
+ *
+ * Logs the invocation and marks the session as SESSION_STATE_READY.
+ *
+ * @param s - session being accepted
+ * @return 0, unconditionally
+ */
+int
+builtin_session_accept_callback (stream_session_t * s)
+{
+  clib_warning ("called...");
+  s->session_state = SESSION_STATE_READY;
+  return 0;
+}
+
+/**
+ * Disconnect callback of the builtin test server.
+ *
+ * Placeholder: only logs that it was invoked; the session is not touched.
+ *
+ * @param s - session being disconnected (unused)
+ */
+void
+builtin_session_disconnect_callback (stream_session_t * s)
+{
+  clib_warning ("called...");
+}
+
+/**
+ * Connected callback of the builtin test server.
+ *
+ * Placeholder: logs the invocation and always reports failure.
+ *
+ * @param client_index - unused
+ * @param s - unused
+ * @param is_fail - unused
+ * @return -1, unconditionally
+ */
+int
+builtin_session_connected_callback (u32 client_index,
+                                    stream_session_t * s, u8 is_fail)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+/**
+ * Add-segment callback of the builtin test server.
+ *
+ * Placeholder: logs the invocation and always reports failure.
+ *
+ * @param client_index - unused
+ * @param seg_name - unused
+ * @param seg_size - unused
+ * @return -1, unconditionally
+ */
+int
+builtin_add_segment_callback (u32 client_index,
+                              const u8 * seg_name, u32 seg_size)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+/**
+ * Redirect-connect callback of the builtin test server.
+ *
+ * Placeholder: logs the invocation and always reports failure.
+ *
+ * @param client_index - unused
+ * @param mp - unused
+ * @return -1, unconditionally
+ */
+int
+builtin_redirect_connect_callback (u32 client_index, void *mp)
+{
+  clib_warning ("called...");
+  return -1;
+}
+
+/**
+ * Rx callback of the builtin test server.
+ *
+ * Placeholder: logs the invocation; no data is read from the session.
+ *
+ * @param s - session the callback was invoked for (unused)
+ * @return 0, unconditionally
+ */
+int
+builtin_server_rx_callback (stream_session_t * s)
+{
+  clib_warning ("called...");
+  return 0;
+}
+
+/** Callback table handed to the bind call made by server_create. */
+static session_cb_vft_t builtin_session_cb_vft = {
+  .session_accept_callback = builtin_session_accept_callback,
+  .session_disconnect_callback = builtin_session_disconnect_callback,
+  .session_connected_callback = builtin_session_connected_callback,
+  .add_segment_callback = builtin_add_segment_callback,
+  .redirect_connect_callback = builtin_redirect_connect_callback,
+  .builtin_server_rx_callback = builtin_server_rx_callback,
+};
+
+/**
+ * Bind the builtin test server to "tcp://0.0.0.0/80".
+ *
+ * Fills a zeroed vnet_bind_args_t with the builtin callback table, a
+ * 256 << 10 byte segment size and 64 << 10 byte rx/tx fifo sizes, then
+ * hands it to vnet_bind_uri.
+ *
+ * @param vm - vlib main (unused)
+ * @return whatever vnet_bind_uri returns
+ */
+static int
+server_create (vlib_main_t * vm)
+{
+  char segment_name[128];
+  u64 options[SESSION_OPTIONS_N_OPTIONS];
+  vnet_bind_args_t bind_args, *a = &bind_args;
+
+  memset (options, 0, sizeof (options));
+  memset (a, 0, sizeof (*a));
+
+  /* Who is binding, and where */
+  a->session_cb_vft = &builtin_session_cb_vft;
+  a->api_client_index = ~0;
+  a->uri = "tcp://0.0.0.0/80";
+
+  /* Segment and fifo sizing */
+  options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 10;
+  options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
+  options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
+  a->options = options;
+
+  /* Stack buffer for the segment name, with its capacity */
+  a->segment_name = segment_name;
+  a->segment_name_length = ARRAY_LEN (segment_name);
+
+  return vnet_bind_uri (a);
+}
+
+/**
+ * CLI handler for "test server": creates the builtin test server.
+ *
+ * The command takes no arguments; the input is not parsed.
+ *
+ * @param vm - vlib main, forwarded to server_create
+ * @param input - unused
+ * @param cmd - unused
+ * @return 0 on success, otherwise a clib error that carries the value
+ *         returned by server_create
+ */
+static clib_error_t *
+server_create_command_fn (vlib_main_t * vm,
+                          unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  int rv;
+
+  rv = server_create (vm);
+  if (rv != 0)
+    return clib_error_return (0, "server_create returned %d", rv);
+
+  return 0;
+}
+
+/* Registers the "test server" CLI command, served by server_create_command_fn */
+VLIB_CLI_COMMAND (server_create_command, static) =
+{
+  .path = "test server",
+  .short_help = "test server",
+  .function = server_create_command_fn,
+};
+
+/*
+* fd.io coding-style-patch-verification: ON
+*
+* Local Variables:
+* eval: (c-set-style "gnu")
+* End:
+*/
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 0f9b7097b42..e5feaeb1e03 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -217,6 +217,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
return 0;
}
+#define PORT_MASK ((1 << 16)- 1)
/**
* Allocate local port and add if successful add entry to local endpoint
* table to mark the pair as used.
@@ -224,7 +225,6 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
u16
tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip)
{
- u8 unique = 0;
transport_endpoint_t *tep;
u32 time_now, tei;
u16 min = 1024, max = 65535, tries; /* XXX configurable ? */
@@ -235,37 +235,40 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip)
/* Start at random point or max */
pool_get (tm->local_endpoints, tep);
clib_memcpy (&tep->ip, ip, sizeof (*ip));
- tep->port = random_u32 (&time_now) << 16;
- tep->port = tep->port < min ? max : tep->port;
/* Search for first free slot */
- while (tries)
+ /* tries is a u16, so a "tries >= 0" test can never fail and the loop
+ * would spin forever once every port is in use. Stop when the budget
+ * reaches 0 so the "No free ports" path below is reachable. */
+ for (; tries > 0; tries--)
{
+ u16 port = 0;
+
+ /* Find a port in the specified range */
+ while (1)
+ {
+ port = random_u32 (&time_now) & PORT_MASK;
+ if (PREDICT_TRUE (port >= min && port < max))
+ break;
+ }
+
+ tep->port = port;
+
+ /* Look it up */
tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip,
tep->port);
+ /* If not found, we're done */
if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX)
{
- unique = 1;
- break;
+ transport_endpoint_table_add (&tm->local_endpoints_table, tep,
+ tep - tm->local_endpoints);
+ return tep->port;
}
-
- tep->port--;
-
- if (tep->port < min)
- tep->port = max;
-
- tries--;
}
-
- if (unique)
- {
- transport_endpoint_table_add (&tm->local_endpoints_table, tep,
- tep - tm->local_endpoints);
-
- return tep->port;
- }
-
- /* Failed */
+ /* No free ports.
+ * NOTE(review): the -1 below is converted to 65535 by the u16 return
+ * type, which a "lcl_port < 1" test in the caller presumably never
+ * matches - confirm, and consider returning 0 or a wider type. */
pool_put (tm->local_endpoints, tep);
return -1;
}
@@ -360,7 +357,10 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4)
/* Allocate source port */
lcl_port = tcp_allocate_local_port (tm, &lcl_addr);
if (lcl_port < 1)
- return -1;
+ {
+ clib_warning ("Failed to allocate src port");
+ return -1;
+ }
/*
* Create connection and send SYN
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 22f00a63273..3560509d090 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -30,7 +30,8 @@
#define TCP_MAX_OPTION_SPACE 40
#define TCP_DUPACK_THRESHOLD 3
-#define TCP_DEFAULT_RX_FIFO_SIZE 64 << 10
+#define TCP_MAX_RX_FIFO_SIZE (2 << 20)	/* parenthesized so it expands safely inside larger expressions */
+#define TCP_IW_N_SEGMENTS 10
/** TCP FSM state definitions as per RFC793. */
#define foreach_tcp_fsm_state \
@@ -590,7 +591,6 @@ vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq,
/**
* Push TCP header to buffer
*
- * @param vm - vlib_main
* @param b - buffer to write the header to
* @param sp_net - source port net order
* @param dp_net - destination port net order
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index daa0683b48e..0a907d0a3d7 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -711,7 +711,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
if (tcp_opts_sack_permitted (&tc->opt))
tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
- new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale;
+ new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale;
if (tcp_ack_is_dupack (tc, b, new_snd_wnd))
{
@@ -1320,7 +1320,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Parse options */
tcp_options_parse (tcp0, &new_tc0->opt);
- tcp_connection_init_vars (new_tc0);
if (tcp_opts_tstamp (&new_tc0->opt))
{
@@ -1331,11 +1330,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (tcp_opts_wscale (&new_tc0->opt))
new_tc0->snd_wscale = new_tc0->opt.wscale;
- new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window)
- << new_tc0->snd_wscale;
+ /* No scaling */
+ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
new_tc0->snd_wl1 = seq0;
new_tc0->snd_wl2 = ack0;
+ tcp_connection_init_vars (new_tc0);
+
/* SYN-ACK: See if we can switch to ESTABLISHED state */
if (tcp_ack (tcp0))
{
@@ -1345,6 +1346,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
new_tc0->snd_una = ack0;
new_tc0->state = TCP_STATE_ESTABLISHED;
+ /* Make sure las is initialized for the wnd computation */
+ new_tc0->rcv_las = new_tc0->rcv_nxt;
+
/* Notify app that we have connection */
stream_session_connect_notify (&new_tc0->connection, sst, 0);
@@ -1575,7 +1579,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Initialize session variables */
tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
- tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window)
+ tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
<< tc0->opt.wscale;
tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
@@ -1899,7 +1903,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
tcp_options_parse (th0, &child0->opt);
- tcp_connection_init_vars (child0);
child0->irs = vnet_buffer (b0)->tcp.seq_number;
child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
@@ -1913,6 +1916,16 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
child0->tsval_recent_age = tcp_time_now ();
}
+ if (tcp_opts_wscale (&child0->opt))
+ child0->snd_wscale = child0->opt.wscale;
+
+ /* No scaling */
+ child0->snd_wnd = clib_net_to_host_u16 (th0->window);
+ child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
+ child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
+
+ tcp_connection_init_vars (child0);
+
/* Reuse buffer to make syn-ack and send */
tcp_make_synack (child0, b0);
next0 = tcp_next_output (is_ip4);
@@ -1923,7 +1936,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
- b0->error = error0 ? node->errors[error0] : 0;
+ b0->error = node->errors[error0];
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
n_left_to_next, bi0, next0);
@@ -2069,7 +2082,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
u32 n_left_from, next_index, *from, *to_next;
u32 my_thread_index = vm->cpu_index;
tcp_main_t *tm = vnet_get_tcp_main ();
- session_manager_main_t *ssm = vnet_get_session_manager_main ();
from = vlib_frame_vector_args (from_frame);
n_left_from = from_frame->n_vectors;
@@ -2109,26 +2121,26 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* lookup session */
tc0 =
- (tcp_connection_t *) stream_session_lookup_transport4 (ssm,
- &ip40->dst_address,
- &ip40->src_address,
- tcp0->dst_port,
- tcp0->src_port,
- SESSION_TYPE_IP4_TCP,
- my_thread_index);
+ (tcp_connection_t *)
+ stream_session_lookup_transport4 (&ip40->dst_address,
+ &ip40->src_address,
+ tcp0->dst_port,
+ tcp0->src_port,
+ SESSION_TYPE_IP4_TCP,
+ my_thread_index);
}
else
{
ip60 = vlib_buffer_get_current (b0);
tcp0 = ip6_next_header (ip60);
tc0 =
- (tcp_connection_t *) stream_session_lookup_transport6 (ssm,
- &ip60->src_address,
- &ip60->dst_address,
- tcp0->src_port,
- tcp0->dst_port,
- SESSION_TYPE_IP6_TCP,
- my_thread_index);
+ (tcp_connection_t *)
+ stream_session_lookup_transport6 (&ip60->src_address,
+ &ip60->dst_address,
+ tcp0->src_port,
+ tcp0->dst_port,
+ SESSION_TYPE_IP6_TCP,
+ my_thread_index);
}
/* Session exists */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index dbcf1f74975..7e431cd0454 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -91,24 +91,30 @@ tcp_window_compute_scale (u32 available_space)
}
/**
+ * TCP's IW as recommended by RFC6928
+ */
+always_inline u32
+tcp_initial_wnd_unscaled (tcp_connection_t * tc)
+{
+ return TCP_IW_N_SEGMENTS * dummy_mtu;
+}
+
+/**
* Compute initial window and scale factor. As per RFC1323, window field in
* SYN and SYN-ACK segments is never scaled.
*/
u32
tcp_initial_window_to_advertise (tcp_connection_t * tc)
{
- u32 available_space;
+ u32 max_fifo;
/* Initial wnd for SYN. Fifos are not allocated yet.
- * Use some predefined value */
- if (tc->state != TCP_STATE_SYN_RCVD)
- {
- return TCP_DEFAULT_RX_FIFO_SIZE;
- }
+ * Use some predefined value. For SYN-ACK we still want the
+ * scale to be computed in the same way */
+ max_fifo = TCP_MAX_RX_FIFO_SIZE;
- available_space = stream_session_max_enqueue (&tc->connection);
- tc->rcv_wscale = tcp_window_compute_scale (available_space);
- tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale);
+ tc->rcv_wscale = tcp_window_compute_scale (max_fifo);
+ tc->rcv_wnd = tcp_initial_wnd_unscaled (tc);
return clib_min (tc->rcv_wnd, TCP_WND_MAX);
}
@@ -119,23 +125,43 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc)
u32
tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state)
{
- u32 available_space, wnd, scaled_space;
+ u32 available_space, max_fifo, observed_wnd;
- if (state != TCP_STATE_ESTABLISHED)
+ if (state < TCP_STATE_ESTABLISHED)
return tcp_initial_window_to_advertise (tc);
+ /*
+ * Figure out how much space we have available
+ */
available_space = stream_session_max_enqueue (&tc->connection);
- scaled_space = available_space >> tc->rcv_wscale;
+ max_fifo = stream_session_fifo_size (&tc->connection);
+
+ ASSERT (tc->opt.mss < max_fifo);
+
+ if (available_space < tc->opt.mss && available_space < max_fifo / 8)
+ available_space = 0;
- /* Need to update scale */
- if (PREDICT_FALSE ((scaled_space == 0 && available_space != 0))
- || (scaled_space >= TCP_WND_MAX))
- tc->rcv_wscale = tcp_window_compute_scale (available_space);
+ /*
+ * Use the above and what we know about what we've previously advertised
+ * to compute the new window
+ */
+ observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
- wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale);
- tc->rcv_wnd = wnd;
+ /* Bad. Thou shalt not shrink */
+ if (available_space < observed_wnd)
+ {
+ if (available_space == 0)
+ clib_warning ("Didn't shrink rcv window despite not having space");
+ }
+
+ tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale);
+
+ if (tc->rcv_wnd == 0)
+ {
+ tc->flags |= TCP_CONN_SENT_RCV_WND0;
+ }
- return wnd >> tc->rcv_wscale;
+ return tc->rcv_wnd >> tc->rcv_wscale;
}
/**
@@ -225,7 +251,7 @@ tcp_options_write (u8 * data, tcp_options_t * opts)
}
always_inline int
-tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd)
+tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale)
{
u8 len = 0;
@@ -234,7 +260,7 @@ tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd)
len += TCP_OPTION_LEN_MSS;
opts->flags |= TCP_OPTS_FLAG_WSCALE;
- opts->wscale = tcp_window_compute_scale (initial_wnd);
+ opts->wscale = wnd_scale;
len += TCP_OPTION_LEN_WINDOW_SCALE;
opts->flags |= TCP_OPTS_FLAG_TSTAMP;
@@ -327,8 +353,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
case TCP_STATE_SYN_RCVD:
return tcp_make_synack_options (tc, opts);
case TCP_STATE_SYN_SENT:
- return tcp_make_syn_options (opts,
- tcp_initial_window_to_advertise (tc));
+ return tcp_make_syn_options (opts, tc->rcv_wscale);
default:
clib_warning ("Not handled!");
return 0;
@@ -732,7 +757,7 @@ tcp_send_syn (tcp_connection_t * tc)
/* Make and write options */
memset (&snd_opts, 0, sizeof (snd_opts));
- tcp_opts_len = tcp_make_syn_options (&snd_opts, initial_wnd);
+ tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
@@ -900,7 +925,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
tcp_reuse_buffer (vm, b);
- ASSERT (tc->state == TCP_STATE_ESTABLISHED);
+ ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
ASSERT (max_bytes != 0);
if (tcp_opts_sack_permitted (&tc->opt))
@@ -929,7 +954,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
max_bytes);
ASSERT (n_bytes != 0);
- tc->snd_nxt += n_bytes;
tcp_push_hdr_i (tc, b, tc->state);
return n_bytes;
@@ -967,7 +991,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tcp_get_free_buffer_index (tm, &bi);
b = vlib_get_buffer (vm, bi);
- if (tc->state == TCP_STATE_ESTABLISHED)
+ if (tc->state >= TCP_STATE_ESTABLISHED)
{
tcp_fastrecovery_off (tc);
@@ -977,6 +1001,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* Figure out what and how many bytes we can send */
snd_space = tcp_available_snd_space (tc);
max_bytes = clib_min (tc->snd_mss, snd_space);
+
+ if (max_bytes == 0)
+ {
+ clib_warning ("no wnd to retransmit");
+ return;
+ }
tcp_prepare_retransmit_segment (tc, b, max_bytes);
tc->rtx_bytes += max_bytes;
@@ -996,7 +1026,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
+
tcp_push_hdr_i (tc, b, tc->state);
+
+ /* Account for the SYN */
+ tc->snd_nxt += 1;
}
if (!is_syn)
@@ -1163,8 +1197,8 @@ tcp46_output_inline (vlib_main_t * vm,
if (PREDICT_FALSE
(vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK))
{
+ ASSERT (tc0->snt_dupacks > 0);
tc0->snt_dupacks--;
- ASSERT (tc0->snt_dupacks >= 0);
if (!tcp_session_has_ooo_data (tc0))
{
error0 = TCP_ERROR_FILTERED_DUPACKS;