about | summary | refs | log | tree | commit | diff | stats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
author: Florin Coras <fcoras@cisco.com> 2018-11-07 12:49:19 -0800
committer: Damjan Marion <dmarion@me.com> 2018-11-08 11:20:29 +0000
commit: efefc6b4b219e2897e48def83352b4df52bc03a0 (patch)
tree: 50e7a57901e34365c1c4f5dc8868c705b192d864 /src/vnet/tcp
parent: 221d6f131d644b2d14f741c4b3031f53b8a8ff03 (diff)
tcp: pacer and mrtt estimation improvements
- update pacer once per burst
- better estimate initial rtt
- compute smoothed average for higher precision rtt estimate

Change-Id: I06d41a98784cdf861bedfbee2e7d0afc0d0154ef
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- src/vnet/tcp/tcp.c 7
-rw-r--r-- src/vnet/tcp/tcp.h 1
-rwxr-xr-x src/vnet/tcp/tcp_debug.h 9
-rw-r--r-- src/vnet/tcp/tcp_input.c 62
-rw-r--r-- src/vnet/tcp/tcp_output.c 2
5 files changed, 61 insertions, 20 deletions
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index ea350dddc69..d759cf0d0cd 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -802,9 +802,10 @@ format_tcp_vars (u8 * s, va_list * args)
tcp_rcv_wnd_available (tc));
s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
tcp_time_now () - tc->tsval_recent_age);
- s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %2.5f ",
- tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
- s = format (s, "rtt_seq %u\n", tc->rtt_seq - tc->iss);
+ s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %x",
+ tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar,
+ tc->rtt_ts);
+ s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss);
s = format (s, " cong: %U", format_tcp_congestion, tc);
if (tc->state >= TCP_STATE_ESTABLISHED)
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 5a3a96570d2..843b90d987e 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -753,7 +753,6 @@ tcp_cc_rcv_ack (tcp_connection_t * tc)
{
tc->cc_algo->rcv_ack (tc);
tc->tsecr_last_ack = tc->rcv_opts.tsecr;
- tcp_connection_tx_pacer_update (tc);
}
always_inline void
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index cd4a6f04d6e..d125ee84612 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -806,13 +806,14 @@ if (TCP_DEBUG_CC > 1) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "rcv_stat: rto %u srtt %u rttvar %u ", \
- .format_args = "i4i4i4", \
+ .format = "rcv_stat: rto %u srtt %u mrtt-us %u rttvar %u", \
+ .format_args = "i4i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 3); \
+ DECLARE_ETD(_tc, _e, 4); \
ed->data[0] = _tc->rto; \
ed->data[1] = _tc->srtt; \
- ed->data[2] = _tc->rttvar; \
+ ed->data[2] = (u32) (_tc->mrtt_us * 1e6); \
+ ed->data[3] = _tc->rttvar; \
}
#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 9c303eb01a5..0f1ab1ab3b0 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -455,8 +455,11 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
{
- tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
- mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
+ f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
+ tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
+ mrtt = clib_max ((u32) (sample * THZ), 1);
+ /* Allow measuring of a new RTT */
+ tc->rtt_ts = 0;
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
@@ -475,9 +478,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
done:
- /* Allow measuring of a new RTT */
- tc->rtt_ts = 0;
-
/* If we got here something must've been ACKed so make sure boff is 0,
* even if mrtt is not valid since we update the rto lower */
tc->rto_boff = 0;
@@ -486,6 +486,29 @@ done:
return 0;
}
+static void
+tcp_estimate_initial_rtt (tcp_connection_t * tc)
+{
+ u8 thread_index = vlib_num_workers ()? 1 : 0;
+ int mrtt;
+
+ if (tc->rtt_ts)
+ {
+ tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
+ mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
+ tc->rtt_ts = 0;
+ }
+ else
+ {
+ mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
+ tc->mrtt_us = (f64) mrtt *TCP_TICK;
+
+ }
+
+ if (mrtt > 0 && mrtt < TCP_RTT_MAX)
+ tcp_estimate_rtt (tc, mrtt);
+}
+
/**
* Dequeue bytes for connections that have received acks in last burst
*/
@@ -506,6 +529,9 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
tc = tcp_connection_get (pending_deq_acked[i], thread_index);
tc->flags &= ~TCP_CONN_DEQ_PENDING;
+ if (PREDICT_FALSE (!tc->burst_acked))
+ continue;
+
/* Dequeue the newly ACKed bytes */
stream_session_dequeue_drop (&tc->connection, tc->burst_acked);
tc->burst_acked = 0;
@@ -514,6 +540,11 @@ tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
/* If everything has been acked, stop retransmit timer
* otherwise update. */
tcp_retransmit_timer_update (tc);
+
+ /* If not congested, update pacer based on our new
+ * cwnd estimate */
+ if (!tcp_in_fastrecovery (tc))
+ tcp_connection_tx_pacer_update (tc);
}
_vec_len (wrk->pending_deq_acked) = 0;
}
@@ -1084,6 +1115,7 @@ tcp_cc_recovery_exit (tcp_connection_t * tc)
tcp_update_rto (tc);
tc->snd_rxt_ts = 0;
tc->snd_nxt = tc->snd_una_max;
+ tc->rtt_ts = 0;
tcp_recovery_off (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
@@ -1096,6 +1128,7 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
tc->rcv_dupacks = 0;
tc->snd_nxt = tc->snd_una_max;
tc->snd_rxt_bytes = 0;
+ tc->rtt_ts = 0;
tcp_fastrecovery_off (tc);
tcp_fastrecovery_1_smss_off (tc);
@@ -1381,6 +1414,10 @@ partial_ack:
* Legitimate ACK. 1) See if we can exit recovery
*/
+ /* Update the pacing rate. For the first partial ack we move from
+ * the artificially constrained rate to the one after congestion */
+ tcp_connection_tx_pacer_update (tc);
+
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
tcp_retransmit_timer_update (tc);
@@ -1403,10 +1440,6 @@ partial_ack:
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
- /* Update the pacing rate. For the first partial ack we move from
- * the artificially constrained rate to the one after congestion */
- tcp_connection_tx_pacer_update (tc);
-
/* XXX limit this only to first partial ack? */
tcp_retransmit_timer_force_update (tc);
@@ -2427,7 +2460,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
/* Update rtt with the syn-ack sample */
- tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+ tcp_estimate_initial_rtt (new_tc0);
TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
error0 = TCP_ERROR_SYN_ACKS_RCVD;
}
@@ -2636,7 +2669,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
/* Update rtt and rto */
- tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
+ tcp_estimate_initial_rtt (tc0);
/* Switch state to ESTABLISHED */
tc0->state = TCP_STATE_ESTABLISHED;
@@ -2687,6 +2720,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
* wait for peer's FIN but not indefinitely. */
tcp_connection_timers_reset (tc0);
tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+
+ /* Don't try to deq the FIN acked */
+ if (tc0->burst_acked > 1)
+ stream_session_dequeue_drop (&tc0->connection,
+ tc0->burst_acked - 1);
+ tc0->burst_acked = 0;
}
break;
case TCP_STATE_FIN_WAIT_2:
@@ -2695,6 +2734,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
* acknowledged ("ok") but do not delete the TCB. */
if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
goto drop;
+ tc0->burst_acked = 0;
break;
case TCP_STATE_CLOSE_WAIT:
/* Do the same processing as for the ESTABLISHED state. */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 089f85a0ea0..192e820e648 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1000,7 +1000,7 @@ tcp_send_syn (tcp_connection_t * tc)
tcp_make_syn (tc, b);
/* Measure RTT with this */
- tc->rtt_ts = tcp_time_now ();
+ tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0);
tc->rtt_seq = tc->snd_nxt;
tc->rto_boff = 0;