about | summary | refs | log | tree | commit | diff | stats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- src/vnet/tcp/builtin_client.c | 2
-rw-r--r-- src/vnet/tcp/tcp.c | 7
-rw-r--r-- src/vnet/tcp/tcp.h | 9
-rwxr-xr-x src/vnet/tcp/tcp_debug.h | 9
-rw-r--r-- src/vnet/tcp/tcp_input.c | 81
-rw-r--r-- src/vnet/tcp/tcp_output.c | 91
6 files changed, 129 insertions, 70 deletions
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 5b4c8679970..527b3289924 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -510,7 +510,7 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients)
if ((i % 4) == 0)
vlib_process_suspend (vm, 10e-6);
ASSERT (i + 1 >= tm->ready_connections);
- while (i + 1 - tm->ready_connections > 8000)
+ while (i + 1 - tm->ready_connections > 1000)
{
vlib_process_suspend (vm, 100e-6);
}
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 04f1e068b9d..f779428fbaf 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -1035,7 +1035,7 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
/* If not snd_wnd constrained and we can't write at least a segment,
* don't try at all */
if (PREDICT_FALSE (snd_space < tc->snd_mss))
- return 0;
+ return snd_space < tc->cwnd ? 0 : snd_space;
/* round down to mss multiple */
return snd_space - (snd_space % tc->snd_mss);
@@ -1167,6 +1167,7 @@ tcp_timer_establish_handler (u32 conn_index)
{
ASSERT (tc->state == TCP_STATE_SYN_SENT);
stream_session_connect_notify (&tc->connection, 1 /* fail */ );
+ TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
}
else
{
@@ -1174,7 +1175,7 @@ tcp_timer_establish_handler (u32 conn_index)
/* note: the connection may have already disappeared */
if (PREDICT_FALSE (tc == 0))
return;
-
+ TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
ASSERT (tc->state == TCP_STATE_SYN_RCVD);
/* Start cleanup. App wasn't notified yet so use delete notify as
* opposed to delete to cleanup session layer state. */
@@ -1369,6 +1370,8 @@ tcp_main_enable (vlib_main_t * vm)
vec_validate (tm->tx_frames[0], num_threads - 1);
vec_validate (tm->tx_frames[1], num_threads - 1);
+ vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1);
+ vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1);
tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
(vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 6020a3debbe..bb8091af84f 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -99,8 +99,9 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
#define TCP_ESTABLISH_TIME 750 /* 75s */
#define TCP_SYN_RCVD_TIME 600 /* 60s */
#define TCP_2MSL_TIME 300 /* 30s */
-#define TCP_CLOSEWAIT_TIME 20 /* 0.1s */
-#define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */
+#define TCP_CLOSEWAIT_TIME 20 /* 2s */
+#define TCP_TIMEWAIT_TIME 20 /* 2s */
+#define TCP_CLEANUP_TIME 10 /* 1s Time to wait before cleanup */
#define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */
#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */
@@ -372,8 +373,10 @@ typedef struct _tcp_main
/** per-worker tx buffer free lists */
u32 **tx_buffers;
- /** per-worker tx frames to 4/6 output nodes */
+ /** per-worker tx frames to tcp 4/6 output nodes */
vlib_frame_t **tx_frames[2];
+ /** per-worker tx frames to ip 4/6 lookup nodes */
+ vlib_frame_t **ip_lookup_tx_frames[2];
/* Per worker-thread timer wheel for connections timers */
tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index cf77e6e6682..4bc6b42e297 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -82,13 +82,7 @@ typedef enum _tcp_dbg_evt
* Infra and evt track setup
*/
-#define TCP_DBG(_tc, _evt, _args...) \
-{ \
- u8 *_tmp = 0; \
- _tmp = format(_tmp, "%U", format_tcp_connection_verbose, _tc); \
- clib_warning("%s", _tmp); \
- vec_free(_tmp); \
-}
+#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args)
#define DECLARE_ETD(_tc, _e, _size) \
struct \
@@ -240,6 +234,7 @@ typedef enum _tcp_dbg_evt
#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
#else
#define TCP_EVT_DBG(_evt, _args...)
+#define TCP_DBG(_fmt, _args...)
#endif
/*
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 841e72a503e..64a07070ec2 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -351,12 +351,17 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
if (tcp_syn (th0))
{
/* TODO implement RFC 5961 */
- if (tc0->state != TCP_STATE_SYN_RCVD)
- tcp_make_ack (tc0, b0);
+ if (tc0->state == TCP_STATE_SYN_RCVD)
+ {
+ tcp_make_synack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
+ }
else
- tcp_make_synack (tc0, b0);
+ {
+ tcp_make_ack (tc0, b0);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
+ }
*next0 = tcp_next_output (tc0->c_is_ip4);
- TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
return -1;
}
@@ -1747,18 +1752,17 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* 8: check the FIN bit */
if (PREDICT_FALSE (is_fin))
{
- /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
- * wait for session to call close. To avoid lingering
+ /* Enter CLOSE-WAIT and notify session. To avoid lingering
* in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
- tc0->state = TCP_STATE_CLOSE_WAIT;
- TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+ /* Account for the FIN if nothing else was received */
if (vnet_buffer (b0)->tcp.data_len == 0)
- {
- tc0->rcv_nxt += 1;
- next0 = TCP_ESTABLISHED_NEXT_DROP;
- }
+ tc0->rcv_nxt += 1;
+ tcp_make_ack (tc0, b0);
+ next0 = tcp_next_output (tc0->c_is_ip4);
+ tc0->state = TCP_STATE_CLOSE_WAIT;
stream_session_disconnect_notify (&tc0->connection);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+ TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
}
done:
@@ -1973,6 +1977,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
seq0 = vnet_buffer (b0)->tcp.seq_number;
tcp0 = tcp_buffer_hdr (b0);
+ /* Crude check to see if the connection handle does not match
+ * the packet. Probably connection just switched to established */
+ if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
+ || tcp0->src_port != tc0->c_rmt_port))
+ goto drop;
+
if (PREDICT_FALSE
(!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
goto drop;
@@ -2265,6 +2275,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp_header_t *tcp0 = 0;
tcp_connection_t *tc0;
u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
+ u8 is_fin0;
bi0 = from[0];
to_next[0] = bi0;
@@ -2283,11 +2294,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
tcp0 = tcp_buffer_hdr (b0);
+ is_fin0 = tcp_is_fin (tcp0);
/* SYNs, FINs and data consume sequence numbers */
vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
- + tcp_is_syn (tcp0) + tcp_is_fin (tcp0)
- + vnet_buffer (b0)->tcp.data_len;
+ + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len;
if (CLIB_DEBUG)
{
@@ -2384,21 +2395,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* If FIN is ACKed */
else if (tc0->snd_una == tc0->snd_una_max)
{
- tc0->rcv_nxt += 1;
tc0->state = TCP_STATE_FIN_WAIT_2;
TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
- if (tcp_fin (tcp0))
- {
- /* Stop all timers, 2MSL will be set lower */
- tcp_connection_timers_reset (tc0);
- }
- else
- {
- /* Wait for peer to finish sending its data */
- tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
- TCP_2MSL_TIME);
- }
+ /* Stop all retransmit timers because we have nothing more
+ * to send. Enable waitclose though because we're willing to
+ * wait for peer's FIN but not indefinitely. */
+ tcp_connection_timers_reset (tc0);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
}
break;
case TCP_STATE_FIN_WAIT_2:
@@ -2434,10 +2438,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (!tcp_rcv_ack_is_acceptable (tc0, b0))
goto drop;
+ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
/* Apparently our FIN was lost */
- if (tcp_fin (tcp0))
+ if (is_fin0)
{
- /* Don't "make" fin since that increments snd_nxt */
tcp_send_fin (tc0);
goto drop;
}
@@ -2450,8 +2454,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
* particular, this makes sure that we won't have dead sessions
* when processing events on the tx path */
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
-
- /* Stop retransmit */
tcp_retransmit_timer_reset (tc0);
goto drop;
@@ -2466,8 +2468,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
goto drop;
tcp_make_ack (tc0, b0);
- tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE);
- tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
goto drop;
@@ -2486,6 +2487,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
case TCP_STATE_FIN_WAIT_2:
if (vnet_buffer (b0)->tcp.data_len)
error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
+ else if (is_fin0)
+ tc0->rcv_nxt += 1;
break;
case TCP_STATE_CLOSE_WAIT:
case TCP_STATE_CLOSING:
@@ -2497,7 +2500,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
/* 8: check the FIN bit */
- if (!tcp_fin (tcp0))
+ if (!is_fin0)
goto drop;
switch (tc0->state)
@@ -2527,19 +2530,19 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
break;
case TCP_STATE_FIN_WAIT_2:
- /* Got FIN, send ACK! */
+ /* Got FIN, send ACK! Be more aggressive with resource cleanup */
tc0->state = TCP_STATE_TIME_WAIT;
tcp_connection_timers_reset (tc0);
- tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
tcp_make_ack (tc0, b0);
next0 = tcp_next_output (is_ip4);
TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
break;
case TCP_STATE_TIME_WAIT:
- /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
+ /* Remain in the TIME-WAIT state. Restart the time-wait
* timeout.
*/
- tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
break;
}
TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
@@ -3162,9 +3165,9 @@ do { \
TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
_(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
- _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
+ _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
_(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
- _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+ _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
TCP_ERROR_CONNECTION_CLOSED);
#undef _
}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index b843c926afe..be29f05f65c 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -629,9 +629,11 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
}
always_inline void
-tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
- u8 is_ip4)
+tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4, u8 flush)
{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ u32 thread_index = vlib_get_thread_index ();
u32 *to_next, next_index;
vlib_frame_t *f;
@@ -643,13 +645,42 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
/* Send to IP lookup */
next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
- f = vlib_get_frame_to_node (vm, next_index);
+ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+ {
+ b->pre_data[0] = 2;
+ b->pre_data[1] = next_index;
+ }
+
+ f = tm->ip_lookup_tx_frames[!is_ip4][thread_index];
+ if (!f)
+ {
+ f = vlib_get_frame_to_node (vm, next_index);
+ ASSERT (f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f;
+ }
- /* Enqueue the packet */
to_next = vlib_frame_vector_args (f);
- to_next[0] = bi;
- f->n_vectors = 1;
- vlib_put_frame_to_node (vm, next_index, f);
+ to_next[f->n_vectors] = bi;
+ f->n_vectors += 1;
+ if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, next_index, f);
+ tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1);
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+ u8 is_ip4)
+{
+ tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0);
}
always_inline void
@@ -666,8 +697,6 @@ tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
/* Decide where to send the packet */
next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
-
- /* Initialize the trajectory trace, if configured */
if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
{
b->pre_data[0] = 1;
@@ -856,7 +885,7 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4)
ASSERT (!bogus);
}
- tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4);
+ tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4);
TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
}
@@ -968,7 +997,24 @@ tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
}
/**
- * Flush both v4 and v6 tx frames for thread index
+ * Flush ip lookup tx frames populated by timer pops
+ */
+always_inline void
+tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+ if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index])
+ {
+ u32 next_index;
+ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
+ vlib_put_frame_to_node (vm, next_index,
+ tcp_main.ip_lookup_tx_frames[!is_ip4]
+ [thread_index]);
+ tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+ }
+}
+
+/**
+ * Flush v4 and v6 tcp and ip-lookup tx frames for thread index
*/
void
tcp_flush_frames_to_output (u8 thread_index)
@@ -976,6 +1022,8 @@ tcp_flush_frames_to_output (u8 thread_index)
vlib_main_t *vm = vlib_get_main ();
tcp_flush_frame_to_output (vm, thread_index, 1);
tcp_flush_frame_to_output (vm, thread_index, 0);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 1);
+ tcp_flush_frame_to_ip_lookup (vm, thread_index, 0);
}
/**
@@ -984,22 +1032,28 @@ tcp_flush_frames_to_output (u8 thread_index)
void
tcp_send_fin (tcp_connection_t * tc)
{
- vlib_buffer_t *b;
- u32 bi;
tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
+ vlib_buffer_t *b;
+ u32 bi;
+ u8 fin_snt = 0;
+
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
b = vlib_get_buffer (vm, bi);
- /* buffer will be initialized by in tcp_make_fin */
+ fin_snt = tc->flags & TCP_CONN_FINSNT;
+ if (fin_snt)
+ tc->snd_nxt = tc->snd_una;
tcp_make_fin (tc, b);
tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
- if (!(tc->flags & TCP_CONN_FINSNT))
+ if (!fin_snt)
{
tc->flags |= TCP_CONN_FINSNT;
tc->flags &= ~TCP_CONN_FINPNDG;
- tc->snd_nxt += 1;
+ /* Account for the FIN */
+ tc->snd_una_max += 1;
+ tc->snd_nxt = tc->snd_una_max;
}
tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
@@ -1398,7 +1452,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
else if (tc->state == TCP_STATE_SYN_RCVD)
{
tc->rto_boff += 1;
- tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
tc->rtt_ts = 0;
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
@@ -1414,7 +1469,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
else
{
ASSERT (tc->state == TCP_STATE_CLOSED);
- clib_warning ("connection closed ...");
+ TCP_DBG ("connection state: %d", tc->state);
return;
}
}