summaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2018-05-21 17:47:40 -0700
committerDamjan Marion <dmarion@me.com>2018-10-25 10:13:18 +0000
commitd67f112063e6c57160a3d0260537b9dcfe23d217 (patch)
treec2d5251e7896290cc0a968fb2b4d6d9ba87aef17 /src/vnet/tcp
parent2fab01ee0f9b406584272968863eee16a3bb1fb9 (diff)
tcp/session: add tx pacer
Adds tx pacing infrastructure for transport protocols that want to use it. Particularly useful for connections with non-negligible rtt and constrained network throughput as it avoids large tx bursts that lead to local interface tx or network drops. By default the pacer is disabled. To enable it for tcp, add tx-pacing to tcp's startup conf. We are still slightly inefficient in the handling of incoming packets in established state, so the pacer slightly affects maximum throughput in low-latency scenarios. Change-Id: Id445b2ffcd64cce015f75b773f7d722faa0f7ca9 Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r--src/vnet/tcp/tcp.c44
-rw-r--r--src/vnet/tcp/tcp.h23
-rw-r--r--src/vnet/tcp/tcp_input.c22
-rw-r--r--src/vnet/tcp/tcp_output.c2
4 files changed, 72 insertions, 19 deletions
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index cb05b8c0533..626b49997ac 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -555,6 +555,16 @@ tcp_init_snd_vars (tcp_connection_t * tc)
tc->snd_una_max = tc->snd_nxt;
}
+void
+tcp_enable_pacing (tcp_connection_t * tc)
+{
+ u32 max_burst, byte_rate;
+ max_burst = 16 * tc->snd_mss;
+ byte_rate = 2 << 16;
+ transport_connection_tx_pacer_init (&tc->connection, byte_rate, max_burst);
+ tc->mrtt_us = (u32) ~ 0;
+}
+
/** Initialize tcp connection variables
*
* Should be called after having received a msg from the peer, i.e., a SYN or
@@ -572,7 +582,11 @@ tcp_connection_init_vars (tcp_connection_t * tc)
if (!tc->c_is_ip4 && ip6_address_is_link_local_unicast (&tc->c_rmt_ip6))
tcp_add_del_adjacency (tc, 1);
- // tcp_connection_fib_attach (tc);
+ /* tcp_connection_fib_attach (tc); */
+
+ if (transport_connection_is_tx_paced (&tc->connection)
+ || tcp_main.tx_pacing)
+ tcp_enable_pacing (tc);
}
static int
@@ -784,14 +798,19 @@ format_tcp_vars (u8 * s, va_list * args)
s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss);
s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr,
tc->tsecr_last_ack);
- s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto,
- tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
+ s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %2.5f ",
+ tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
s = format (s, "rtt_seq %u\n", tc->rtt_seq);
s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
tcp_time_now () - tc->tsval_recent_age);
if (tc->state >= TCP_STATE_ESTABLISHED)
- s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb,
- tc);
+ {
+ s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb,
+ tc);
+ if (transport_connection_is_tx_paced (&tc->connection))
+ s = format (s, " pacer: %U\n", format_transport_pacer,
+ &tc->connection.pacer);
+ }
if (vec_len (tc->snd_sacks))
s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
@@ -1129,6 +1148,19 @@ const static transport_proto_vft_t tcp_proto = {
};
/* *INDENT-ON* */
+void
+tcp_update_pacer (tcp_connection_t * tc)
+{
+ f64 srtt;
+
+ if (!transport_connection_is_tx_paced (&tc->connection))
+ return;
+
+ srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
+ transport_connection_tx_pacer_update (&tc->connection,
+ ((f64) tc->cwnd) / srtt);
+}
+
static void
tcp_timer_keep_handler (u32 conn_index)
{
@@ -1408,6 +1440,8 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
else if (unformat (input, "max-rx-fifo %U", unformat_memory_size,
&tm->max_rx_fifo))
;
+ else if (unformat (input, "tx-pacing"))
+ tm->tx_pacing = 1;
else
return clib_error_return (0, "unknown input `%U'",
format_unformat_error, input);
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index a036072e546..4ba3d5e9d87 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -160,7 +160,7 @@ enum
};
#define TCP_SCOREBOARD_TRACE (0)
-#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */
+#define TCP_MAX_SACK_BLOCKS 32 /**< Max number of SACK blocks stored */
#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
typedef struct _scoreboard_trace_elt
@@ -319,8 +319,9 @@ typedef struct _tcp_connection
u32 rto_boff; /**< Index for RTO backoff */
u32 srtt; /**< Smoothed RTT */
u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */
- u32 rtt_ts; /**< Timestamp for tracked ACK */
u32 rtt_seq; /**< Sequence number for tracked ACK */
+ f64 rtt_ts; /**< Timestamp for tracked ACK */
+ f64 mrtt_us; /**< High precision mrtt from tracked acks */
u16 mss; /**< Our max seg size that includes options */
u32 limited_transmit; /**< snd_nxt when limited transmit starts */
@@ -444,6 +445,9 @@ typedef struct _tcp_main
u32 last_v6_address_rotor;
ip6_address_t *ip6_src_addresses;
+ /** Enable tx pacing for new connections */
+ u8 tx_pacing;
+
u8 punt_unknown4;
u8 punt_unknown6;
@@ -692,6 +696,12 @@ tcp_time_now (void)
return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now;
}
+always_inline f64
+tcp_time_now_us (u32 thread_index)
+{
+ return transport_time_now (thread_index);
+}
+
always_inline u32
tcp_set_time_now (u32 thread_index)
{
@@ -706,6 +716,15 @@ void tcp_connection_timers_init (tcp_connection_t * tc);
void tcp_connection_timers_reset (tcp_connection_t * tc);
void tcp_init_snd_vars (tcp_connection_t * tc);
void tcp_connection_init_vars (tcp_connection_t * tc);
+void tcp_update_pacer (tcp_connection_t * tc);
+
+always_inline void
+tcp_cc_rcv_ack (tcp_connection_t * tc)
+{
+ tc->cc_algo->rcv_ack (tc);
+ tcp_update_pacer (tc);
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+}
always_inline void
tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b)
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 39a538ba681..ac0e996567e 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -462,14 +462,15 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
{
- mrtt = tcp_time_now () - tc->rtt_ts;
+ tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
+ mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
* seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
{
- mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
+ mrtt = clib_max (tcp_time_now () - tc->rcv_opts.tsecr, 1);
}
/* Ignore dubious measurements */
@@ -1079,12 +1080,14 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
tc->snd_nxt = tc->snd_una_max;
tc->snd_rxt_bytes = 0;
- /* HACK: since we don't have an output pacer, force slow start */
- tc->cwnd = 20 * tc->snd_mss;
-
tcp_fastrecovery_off (tc);
tcp_fastrecovery_1_smss_off (tc);
tcp_fastrecovery_first_off (tc);
+
+ /* Update pacer because our cwnd changed. Also makes sure
+ * that we recompute the max burst size */
+ tcp_update_pacer (tc);
+
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
@@ -1153,8 +1156,7 @@ tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b)
ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
/* Congestion avoidance */
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
/* If a cumulative ack, make sure dupacks is 0 */
tc->rcv_dupacks = 0;
@@ -1372,8 +1374,7 @@ partial_ack:
tc->snd_nxt = tc->snd_una_max;
/* Treat as congestion avoidance ack */
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
return;
}
@@ -1391,8 +1392,7 @@ partial_ack:
/* Post RTO timeout don't try anything fancy */
if (tcp_in_recovery (tc))
{
- tc->cc_algo->rcv_ack (tc);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+ tcp_cc_rcv_ack (tc);
transport_add_tx_event (&tc->connection);
return;
}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 2e6036b410a..f14a61263d4 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1202,7 +1202,7 @@ tcp_push_header (tcp_connection_t * tc, vlib_buffer_t * b)
/* If not tracking an ACK, start tracking */
if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
{
- tc->rtt_ts = tcp_time_now ();
+ tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
tc->rtt_seq = tc->snd_nxt;
}
if (PREDICT_FALSE (!tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)))