aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp/tcp_input.c
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2017-07-18 05:38:03 -0400
committerFlorin Coras <fcoras@cisco.com>2017-07-21 19:20:09 -0400
commit6534b7aa13bc5bed15ed87f47bb766405963e9e8 (patch)
tree999524eff2a5c811ef61e65354e6018c8ae3de33 /src/vnet/tcp/tcp_input.c
parent161c59c75c667ce7a3c1d6173723831dc30e994c (diff)
Improvements to tcp rx path and debugging
- Increment rcv_nxt for fin packets - Call tcp_segment_rcv only if buffer has data - Parse rcv opts before deleting half-open connection - Fix initial rcv_wnd - Improved event logging Change-Id: I9b83c04f432c4cec832c480b03e534deff02c3b1 Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp/tcp_input.c')
-rw-r--r--src/vnet/tcp/tcp_input.c165
1 files changed, 99 insertions, 66 deletions
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index bc7d9015789..cc5cecdc473 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -349,7 +349,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
/* 4th: check the SYN bit */
if (tcp_syn (th0))
{
- tcp_send_reset (b0, tc0->c_is_ip4);
+ /* TODO implement RFC 5961 */
+ tcp_make_ack (tc0, b0);
+ *next0 = tcp_next_output (tc0->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0);
return -1;
}
@@ -1246,8 +1249,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
* Looks okay, process feedback
*/
- TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
-
if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
@@ -1263,6 +1264,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
if (tc->bytes_acked)
tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+ TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+
/*
* Check if we have congestion event
*/
@@ -1496,9 +1499,13 @@ tcp_can_delack (tcp_connection_t * tc)
static int
tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
- u16 n_data_bytes, u32 * next0)
+ u32 * next0)
{
- u32 error = 0, n_bytes_to_drop;
+ u32 error = 0, n_bytes_to_drop, n_data_bytes;
+
+ vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
+ n_data_bytes = vnet_buffer (b)->tcp.data_len;
+ ASSERT (n_data_bytes);
/* Handle out-of-order data */
if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
@@ -1512,7 +1519,12 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
/* Completely in the past (possible retransmit) */
if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
- goto done;
+ {
+ /* Ack retransmissions since we may not have any data to send */
+ tcp_make_ack (tc, b);
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ goto done;
+ }
/* Chop off the bytes in the past */
n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
@@ -1550,12 +1562,6 @@ in_order:
* segments can be enqueued after fifo tail offset changes. */
error = tcp_session_enqueue_data (tc, b, n_data_bytes);
- if (n_data_bytes == 0)
- {
- *next0 = TCP_NEXT_DROP;
- goto done;
- }
-
/* Check if ACK can be delayed */
if (tcp_can_delack (tc))
{
@@ -1680,7 +1686,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
th0 = tcp_buffer_hdr (b0);
- is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
+ /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
+ * dangling reference. */
+ is_fin = tcp_is_fin (th0);
/* SYNs, FINs and data consume sequence numbers */
vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
@@ -1700,29 +1708,23 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* 5: check the ACK field */
if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
- {
- goto done;
- }
+ goto done;
/* 6: check the URG bit TODO */
/* 7: process the segment text */
-
- vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
- error0 = tcp_segment_rcv (tm, tc0, b0,
- vnet_buffer (b0)->tcp.data_len, &next0);
-
- /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
- * dangling reference. */
+ if (vnet_buffer (b0)->tcp.data_len)
+ error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
/* 8: check the FIN bit */
- if (is_fin)
+ if (PREDICT_FALSE (is_fin))
{
/* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
* wait for session to call close. To avoid lingering
* in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
tc0->state = TCP_STATE_CLOSE_WAIT;
TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+ tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0);
stream_session_disconnect_notify (&tc0->connection);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
}
@@ -1856,6 +1858,21 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
seq0 = vnet_buffer (b0)->tcp.seq_number;
tcp0 = tcp_buffer_hdr (b0);
+ if (!tc0)
+ {
+ ip4_header_t *ip40 = vlib_buffer_get_current (b0);
+ tcp0 = ip4_next_header (ip40);
+ tc0 =
+ (tcp_connection_t *)
+ stream_session_lookup_transport_wt4 (&ip40->dst_address,
+ &ip40->src_address,
+ tcp0->dst_port,
+ tcp0->src_port,
+ SESSION_TYPE_IP4_TCP,
+ my_thread_index);
+ ASSERT (0);
+ goto drop;
+ }
if (PREDICT_FALSE
(!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
goto drop;
@@ -1881,8 +1898,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
{
if (!tcp_rst (tcp0))
- tcp_send_reset (b0, is_ip4);
-
+ tcp_send_reset (tc0, b0, is_ip4);
goto drop;
}
@@ -1900,11 +1916,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* If ACK is acceptable, signal client that peer is not
* willing to accept connection and drop connection*/
if (tcp_ack (tcp0))
- {
- stream_session_connect_notify (&tc0->connection, sst,
- 1 /* fail */ );
- tcp_connection_cleanup (tc0);
- }
+ tcp_connection_reset (tc0);
goto drop;
}
@@ -1920,6 +1932,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (!tcp_syn (tcp0))
goto drop;
+ /* Parse options */
+ if (tcp_options_parse (tcp0, &tc0->rcv_opts))
+ goto drop;
+
/* Stop connection establishment and retransmit timers */
tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
@@ -1928,19 +1944,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
* current thread pool. */
pool_get (tm->connections[my_thread_index], new_tc0);
clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
-
- new_tc0->c_thread_index = my_thread_index;
new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
-
- /* Cleanup half-open connection XXX lock */
- pool_put (tm->half_open_connections, tc0);
-
+ new_tc0->c_thread_index = my_thread_index;
new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
new_tc0->irs = seq0;
-
- /* Parse options */
- if (tcp_options_parse (tcp0, &new_tc0->rcv_opts))
- goto drop;
+ tcp_half_open_connection_del (tc0);
if (tcp_opts_tstamp (&new_tc0->rcv_opts))
{
@@ -1959,7 +1967,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp_connection_init_vars (new_tc0);
/* SYN-ACK: See if we can switch to ESTABLISHED state */
- if (tcp_ack (tcp0))
+ if (PREDICT_TRUE (tcp_ack (tcp0)))
{
/* Our SYN is ACKed: we have iss < ack = snd_una */
@@ -1976,7 +1984,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
0))
{
tcp_connection_cleanup (new_tc0);
- tcp_send_reset (b0, is_ip4);
+ tcp_send_reset (tc0, b0, is_ip4);
goto drop;
}
@@ -1986,6 +1994,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Update rtt with the syn-ack sample */
new_tc0->bytes_acked = 1;
tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
}
/* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
else
@@ -1997,12 +2006,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
(&new_tc0->connection, sst, 0))
{
tcp_connection_cleanup (new_tc0);
- tcp_send_reset (b0, is_ip4);
+ tcp_send_reset (tc0, b0, is_ip4);
+ TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0);
goto drop;
}
tc0->rtt_ts = 0;
-
tcp_make_synack (new_tc0, b0);
next0 = tcp_next_output (is_ip4);
@@ -2010,12 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
/* Read data, if any */
- if (vnet_buffer (b0)->tcp.data_len)
+ if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
{
- vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
- error0 = tcp_segment_rcv (tm, new_tc0, b0,
- vnet_buffer (b0)->tcp.data_len,
- &next0);
+ ASSERT (0);
+ error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0);
if (error0 == TCP_ERROR_PURE_ACK)
error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
@@ -2114,6 +2121,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
/* *INDENT-ON* */
VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
+
/**
* Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
* as per RFC793 p. 64
@@ -2202,7 +2210,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
*/
if (!tcp_rcv_ack_is_acceptable (tc0, b0))
{
- tcp_send_reset (b0, is_ip4);
+ tcp_send_reset (tc0, b0, is_ip4);
goto drop;
}
@@ -2243,6 +2251,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
{
ASSERT (tcp_fin (tcp0));
tc0->state = TCP_STATE_FIN_WAIT_2;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
+
/* Stop all timers, 2MSL will be set lower */
tcp_connection_timers_reset (tc0);
}
@@ -2269,6 +2279,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* XXX test that send queue empty */
tc0->state = TCP_STATE_TIME_WAIT;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
goto drop;
break;
@@ -2289,6 +2300,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
tc0->state = TCP_STATE_CLOSED;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
/* Don't delete the connection/session yet. Instead, wait a
* reasonable amount of time until the pipes are cleared. In
@@ -2329,10 +2341,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
case TCP_STATE_FIN_WAIT_2:
- vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
- error0 = tcp_segment_rcv (tm, tc0, b0,
- vnet_buffer (b0)->tcp.data_len,
- &next0);
+ if (vnet_buffer (b0)->tcp.data_len)
+ error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
break;
case TCP_STATE_CLOSE_WAIT:
case TCP_STATE_CLOSING:
@@ -2357,6 +2367,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
next0 = tcp_next_output (tc0->c_is_ip4);
stream_session_disconnect_notify (&tc0->connection);
tc0->state = TCP_STATE_CLOSE_WAIT;
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
break;
case TCP_STATE_CLOSE_WAIT:
case TCP_STATE_CLOSING:
@@ -2367,6 +2378,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tc0->state = TCP_STATE_TIME_WAIT;
tcp_connection_timers_reset (tc0);
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
break;
case TCP_STATE_FIN_WAIT_2:
/* Got FIN, send ACK! */
@@ -2375,6 +2387,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
tcp_make_ack (tc0, b0);
next0 = tcp_next_output (is_ip4);
+ TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
break;
case TCP_STATE_TIME_WAIT:
/* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
@@ -2486,7 +2499,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
{
u32 n_left_from, next_index, *from, *to_next;
u32 my_thread_index = vm->thread_index;
- tcp_main_t *tm = vnet_get_tcp_main ();
u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
from = vlib_frame_vector_args (from_frame);
@@ -2549,14 +2561,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* 3. check for a SYN (did that already) */
/* Create child session and send SYN-ACK */
- pool_get (tm->connections[my_thread_index], child0);
- memset (child0, 0, sizeof (*child0));
-
- child0->c_c_index = child0 - tm->connections[my_thread_index];
+ child0 = tcp_connection_new (my_thread_index);
child0->c_lcl_port = lc0->c_lcl_port;
child0->c_rmt_port = th0->src_port;
child0->c_is_ip4 = is_ip4;
- child0->c_thread_index = my_thread_index;
child0->state = TCP_STATE_SYN_RCVD;
if (is_ip4)
@@ -2605,7 +2613,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
tcp_connection_init_vars (child0);
-
TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
/* Reuse buffer to make syn-ack and send */
@@ -2722,6 +2729,31 @@ typedef enum _tcp_input_next
#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
+static u8
+tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
+{
+ transport_connection_t *tmp;
+ if (!tc)
+ return 1;
+
+ u8 is_valid = (tc->c_lcl_port == hdr->dst_port
+ && (tc->state == TCP_STATE_LISTEN
+ || tc->c_rmt_port == hdr->src_port));
+
+ if (!is_valid)
+ {
+ if ((tmp = stream_session_lookup_half_open (&tc->connection)))
+ {
+ if (tmp->lcl_port == hdr->dst_port
+ && tmp->rmt_port == hdr->src_port)
+ {
+ clib_warning ("half-open is valid!");
+ }
+ }
+ }
+ return is_valid;
+}
+
always_inline uword
tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_frame_t * from_frame, int is_ip4)
@@ -2774,7 +2806,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
- n_advance_bytes0;
- /* lookup session */
tc0 =
(tcp_connection_t *)
stream_session_lookup_transport_wt4 (&ip40->dst_address,
@@ -2783,6 +2814,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp0->src_port,
SESSION_TYPE_IP4_TCP,
my_thread_index);
+ ASSERT (tcp_lookup_is_valid (tc0, tcp0));
}
else
{
@@ -2795,12 +2827,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tc0 =
(tcp_connection_t *)
- stream_session_lookup_transport_wt6 (&ip60->src_address,
- &ip60->dst_address,
- tcp0->src_port,
+ stream_session_lookup_transport_wt6 (&ip60->dst_address,
+ &ip60->src_address,
tcp0->dst_port,
+ tcp0->src_port,
SESSION_TYPE_IP6_TCP,
my_thread_index);
+ ASSERT (tcp_lookup_is_valid (tc0, tcp0));
}
/* Length check */