diff options
author | Florin Coras <fcoras@cisco.com> | 2019-06-12 15:38:19 -0700 |
---|---|---|
committer | Dave Barach <openvpp@barachs.net> | 2019-06-25 16:02:51 +0000 |
commit | 52814737c351b394d28a8b0ee1544176180f45e0 (patch) | |
tree | 76c8f32699624a6beb9554d4effa646c6270a791 /src | |
parent | 6e5baf29e2e48dc62a439d148f243dbb735de786 (diff) |
tcp: delivery rate estimator
Type: feature
First cut implementation with limited testing. The feature is not
enabled by default and the expectation is that cc algorithms will enable
it on demand.
Change-Id: I92b70cb4dabcff0e9ccd1d725952c4880af394da
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/plugins/unittest/tcp_test.c | 312 | ||||
-rw-r--r-- | src/vnet/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/vnet/session/transport.c | 12 | ||||
-rw-r--r-- | src/vnet/session/transport.h | 15 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.c | 6 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.h | 139 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_bt.c | 586 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_cubic.c | 2 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_input.c | 29 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_newreno.c | 5 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_output.c | 21 |
11 files changed, 1100 insertions, 28 deletions
diff --git a/src/plugins/unittest/tcp_test.c b/src/plugins/unittest/tcp_test.c index f919790b031..e604884d107 100644 --- a/src/plugins/unittest/tcp_test.c +++ b/src/plugins/unittest/tcp_test.c @@ -780,6 +780,312 @@ tcp_test_session (vlib_main_t * vm, unformat_input_t * input) return rv; } +static inline int +tbt_seq_lt (u32 a, u32 b) +{ + return seq_lt (a, b); +} + +static int +tcp_test_delivery (vlib_main_t * vm, unformat_input_t * input) +{ + u32 thread_index = 0, snd_una, *min_seqs = 0; + tcp_rate_sample_t _rs = { 0 }, *rs = &_rs; + tcp_connection_t _tc, *tc = &_tc; + sack_scoreboard_t *sb = &tc->sack_sb; + int __clib_unused verbose = 0, i; + u64 rate = 100, burst = 100; + sack_block_t *sacks = 0; + tcp_byte_tracker_t *bt; + rb_node_t *root, *rbn; + tcp_bt_sample_t *bts; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, + input); + return -1; + } + } + + /* Init data structures */ + memset (tc, 0, sizeof (*tc)); + session_main.wrk[thread_index].last_vlib_time = 1; + transport_connection_tx_pacer_update (&tc->connection, rate); + + tcp_bt_init (tc); + bt = tc->bt; + + /* + * Track simple bursts without rxt + */ + + /* 1) track first burst a time 1 */ + tcp_bt_track_tx (tc); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 1, "should have 1 sample"); + bts = pool_elt_at_index (bt->samples, bt->head); + TCP_TEST (bts->min_seq == tc->snd_una, "min seq should be snd_una"); + TCP_TEST (bts->next == TCP_BTS_INVALID_INDEX, "next should be invalid"); + TCP_TEST (bts->prev == TCP_BTS_INVALID_INDEX, "prev should be invalid"); + TCP_TEST (bts->delivered_time == 1, "delivered time should be 1"); + TCP_TEST (bts->delivered == 0, "delivered should be 0"); + TCP_TEST (!(bts->flags & TCP_BTS_IS_RXT), "not retransmitted"); + TCP_TEST (!(bts->flags & TCP_BTS_IS_APP_LIMITED), "not app limited"); + + /* 2) check delivery rate at time 2 */ + session_main.wrk[thread_index].last_vlib_time = 2; + tc->snd_una = tc->snd_nxt = burst; + tc->bytes_acked = burst; + + tcp_bt_sample_delivery_rate (tc, rs); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 0, "sample should've been consumed"); + TCP_TEST (tc->delivered_time == 2, "delivered time should be 2"); + TCP_TEST (tc->delivered == burst, "delivered should be 100"); + TCP_TEST (rs->ack_time == 1, "ack time should be 1"); + TCP_TEST (rs->delivered == burst, "delivered should be 100"); + TCP_TEST (rs->sample_delivered == 0, "sample delivered should be 0"); + TCP_TEST (rs->tx_rate == rate, "delivered should be %u", rate); + TCP_TEST (!(rs->flags & TCP_BTS_IS_RXT), "not retransmitted"); + + /* 3) track second burst at time 2 */ + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + /* 4) track second burst at time 3 */ + session_main.wrk[thread_index].last_vlib_time = 3; + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + TCP_TEST (pool_elts (bt->samples) == 2, "should have 2 samples"); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + bts = pool_elt_at_index (bt->samples, bt->head); + TCP_TEST (bts->min_seq == tc->snd_una, "min seq should be snd_una"); + TCP_TEST (bts->next == bt->tail, "next should tail"); + + bts = pool_elt_at_index (bt->samples, bt->tail); + TCP_TEST (bts->min_seq == tc->snd_nxt - burst, + "min seq should be snd_nxt prior to burst"); + TCP_TEST (bts->prev == bt->head, "prev should be head"); + + /* 5) check delivery rate at time 4 */ + session_main.wrk[thread_index].last_vlib_time = 4; + tc->snd_una = tc->snd_nxt; + tc->bytes_acked = 2 * burst; + + tcp_bt_sample_delivery_rate (tc, rs); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 0, "sample should've been consumed"); + TCP_TEST (tc->delivered_time == 4, "delivered time should be 4"); + TCP_TEST (tc->delivered == 3 * burst, "delivered should be 300 is %u", + tc->delivered); + TCP_TEST (rs->ack_time == 2, "ack time should be 2"); + TCP_TEST (rs->delivered == 2 * burst, "delivered should be 200"); + TCP_TEST (rs->sample_delivered == burst, "delivered should be 100"); + TCP_TEST (rs->tx_rate == rate, "delivered should be %u", rate); + TCP_TEST (!(rs->flags & TCP_BTS_IS_RXT), "not retransmitted"); + TCP_TEST (!(bts->flags & TCP_BTS_IS_APP_LIMITED), "not app limited"); + + /* + * Track retransmissions + * + * snd_una should be 300 at this point + */ + + snd_una = tc->snd_una; + + /* 1) track first burst a time 4 */ + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + /* 2) track second burst at time 5 */ + session_main.wrk[thread_index].last_vlib_time = 5; + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + /* 3) track third burst at time 6 */ + session_main.wrk[thread_index].last_vlib_time = 6; + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + /* 4) track fourth burst at time 7 */ + session_main.wrk[thread_index].last_vlib_time = 7; + /* Limited until last burst is acked */ + tc->app_limited = snd_una + 4 * burst - 1; + tcp_bt_track_tx (tc); + tc->snd_nxt += burst; + + /* 5) check delivery rate at time 8 + * + * tc->snd_una = snd_una + 10 + * sacks: + * [snd_una + burst, snd_una + burst + 10] + * [snd_una + 2 * burst + 10, snd_una + 2 * burst + 20] + */ + session_main.wrk[thread_index].last_vlib_time = 8; + tc->snd_una += 10; + tc->bytes_acked = 10; + sb->last_sacked_bytes = 20; + + TCP_TEST (pool_elts (bt->samples) == 4, "there should be 4 samples"); + + vec_validate (sacks, 1); + sacks[0].start = snd_una + burst; + sacks[0].end = snd_una + burst + 10; + sacks[1].start = snd_una + 2 * burst + 10; + sacks[1].end = snd_una + 2 * burst + 20; + tc->rcv_opts.sacks = sacks; + + tcp_bt_sample_delivery_rate (tc, rs); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 4, "there should be 4 samples"); + TCP_TEST (tc->delivered_time == 8, "delivered time should be 8"); + TCP_TEST (tc->delivered == 3 * burst + 30, "delivered should be %u is %u", + 3 * burst + 30, tc->delivered); + /* All 3 samples have the same delivered number of bytes. So the first is + * the reference for delivery estimate. */ + TCP_TEST (rs->ack_time == 4, "ack time should be 4 is %.2f", rs->ack_time); + TCP_TEST (rs->delivered == 30, "delivered should be 30"); + TCP_TEST (rs->sample_delivered == 3 * burst, + "sample delivered should be %u", 3 * burst); + TCP_TEST (rs->tx_rate == rate, "delivered should be %u", rate); + TCP_TEST (!(rs->flags & TCP_BTS_IS_RXT), "not retransmitted"); + TCP_TEST (!(rs->flags & TCP_BTS_IS_APP_LIMITED), "not app limited"); + + /* 6) Retransmit and track at time 9 + * + * delivered = 3 * burst + 30 + * delivered_time = 8 (last ack) + * + * segments: + * [snd_una + 10, snd_una + burst] + * [snd_una + burst + 10, snd_una + 2 * burst + 10] + * [snd_una + 2 * burst + 20, snd_una + 4 * burst] + */ + session_main.wrk[thread_index].last_vlib_time = 9; + + tcp_bt_track_rxt (tc, snd_una + 10, snd_una + burst); + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + /* The retransmit covers everything left from first burst */ + TCP_TEST (pool_elts (bt->samples) == 4, "there should be 4 samples"); + + tcp_bt_track_rxt (tc, snd_una + burst + 10, snd_una + 2 * burst + 10); + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 5, "there should be 5 samples"); + + /* Retransmit covers last sample entirely so it should be removed */ + tcp_bt_track_rxt (tc, snd_una + 2 * burst + 20, snd_una + 4 * burst); + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 5, "there should be 5 samples"); + + vec_validate (min_seqs, 4); + min_seqs[0] = snd_una + 10; + min_seqs[1] = snd_una + burst; + min_seqs[2] = snd_una + burst + 10; + min_seqs[3] = snd_una + 2 * burst + 10; + min_seqs[4] = snd_una + 2 * burst + 20; + + root = bt->sample_lookup.nodes + bt->sample_lookup.root; + bts = bt->samples + bt->head; + for (i = 0; i < vec_len (min_seqs); i++) + { + if (bts->min_seq != min_seqs[i]) + TCP_TEST (0, "should be %u is %u", min_seqs[i], bts->min_seq); + rbn = rb_tree_search_subtree_custom (&bt->sample_lookup, root, + bts->min_seq, tbt_seq_lt); + if (rbn->opaque != bts - bt->samples) + TCP_TEST (0, "lookup should work"); + bts = bt->samples + bts->next; + } + + /* 7) check delivery rate at time 10 + * + * tc->snd_una = snd_una + 2 * burst + * sacks: + * [snd_una + 2 * burst + 20, snd_una + 2 * burst + 30] + * [snd_una + 2 * burst + 50, snd_una + 2 * burst + 60] + */ + session_main.wrk[thread_index].last_vlib_time = 10; + tc->snd_una = snd_una + 2 * burst; + tc->bytes_acked = 2 * burst - 10; + sb->last_sacked_bytes = 20; + + sacks[0].start = snd_una + 2 * burst + 20; + sacks[0].end = snd_una + 2 * burst + 30; + sacks[1].start = snd_una + 2 * burst + 50; + sacks[1].end = snd_una + 2 * burst + 60; + + tcp_bt_sample_delivery_rate (tc, rs); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 3, "num samples should be 3 is %u", + pool_elts (bt->samples)); + TCP_TEST (tc->delivered_time == 10, "delivered time should be 10"); + TCP_TEST (tc->delivered == 5 * burst + 40, "delivered should be %u is %u", + 5 * burst + 40, tc->delivered); + /* A rxt was acked and delivered time for it is 8 (last ack time) */ + TCP_TEST (rs->ack_time == 2, "ack time should be 2 is %.2f", rs->ack_time); + /* delivered_now - delivered_rxt ~ 5 * burst + 40 - 3 * burst - 30 */ + TCP_TEST (rs->delivered == 2 * burst + 10, "delivered should be 210 is %u", + rs->delivered); + TCP_TEST (rs->sample_delivered == 3 * burst + 30, + "sample delivered should be %u", 3 * burst + 30); + TCP_TEST (rs->tx_rate == rate, "delivered should be %u", rate); + TCP_TEST (rs->flags & TCP_BTS_IS_RXT, "is retransmitted"); + /* Sample is app limited because of the retransmits */ + TCP_TEST (rs->flags & TCP_BTS_IS_APP_LIMITED, "is app limited"); + TCP_TEST (tc->app_limited, "app limited should be set"); + + /* + * 8) check delivery rate at time 11 + */ + session_main.wrk[thread_index].last_vlib_time = 11; + tc->snd_una = tc->snd_nxt; + tc->bytes_acked = 2 * burst; + sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 40; + + memset (rs, 0, sizeof (*rs)); + tcp_bt_sample_delivery_rate (tc, rs); + + TCP_TEST (tcp_bt_is_sane (bt), "tracker should be sane"); + TCP_TEST (pool_elts (bt->samples) == 0, "num samples should be 3 is %u", + pool_elts (bt->samples)); + TCP_TEST (tc->delivered_time == 11, "delivered time should be 10"); + TCP_TEST (tc->delivered == 7 * burst, "delivered should be %u is %u", + 7 * burst, tc->delivered); + /* Last rxt was at time 8 */ + TCP_TEST (rs->ack_time == 3, "ack time should be 3 is %.2f", rs->ack_time); + /* delivered_now - delivered_rxt ~ 7 * burst - 3 * burst - 30. + * That's because we didn't retransmit any new segment. */ + TCP_TEST (rs->delivered == 4 * burst - 30, "delivered should be 160 is %u", + rs->delivered); + TCP_TEST (rs->sample_delivered == 3 * burst + 30, + "sample delivered should be %u", 3 * burst + 30); + TCP_TEST (rs->tx_rate == rate, "delivered should be %u", rate); + TCP_TEST (rs->flags & TCP_BTS_IS_RXT, "is retransmitted"); + TCP_TEST (rs->flags & TCP_BTS_IS_APP_LIMITED, "is app limited"); + TCP_TEST (tc->app_limited == 0, "app limited should be cleared"); + + /* + * Cleanup + */ + vec_free (sacks); + vec_free (min_seqs); + tcp_bt_cleanup (tc); + return 0; +} + static clib_error_t * tcp_test (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd_arg) @@ -800,12 +1106,18 @@ tcp_test (vlib_main_t * vm, { res = tcp_test_lookup (vm, input); } + else if (unformat (input, "delivery")) + { + res = tcp_test_delivery (vm, input); + } else if (unformat (input, "all")) { if ((res = tcp_test_sack (vm, input))) goto done; if ((res = tcp_test_lookup (vm, input))) goto done; + if ((res = tcp_test_delivery (vm, input))) + goto done; } else break; diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt index 5e1a75f9e75..83183af660e 100644 --- a/src/vnet/CMakeLists.txt +++ b/src/vnet/CMakeLists.txt @@ -629,6 +629,7 @@ list(APPEND VNET_SOURCES tcp/tcp_input.c tcp/tcp_newreno.c tcp/tcp_cubic.c + tcp/tcp_bt.c tcp/tcp.c ) diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c index 6077ab19070..bbd9c3555f4 100644 --- a/src/vnet/session/transport.c +++ b/src/vnet/session/transport.c @@ -634,6 +634,12 @@ spacer_set_pace_rate (spacer_t * pacer, u64 rate_bytes_per_sec) pacer->tokens_per_period = rate_bytes_per_sec / transport_pacer_period; } +static inline u64 +spacer_pace_rate (spacer_t * pacer) +{ + return pacer->tokens_per_period * transport_pacer_period; +} + void transport_connection_tx_pacer_reset (transport_connection_t * tc, u32 rate_bytes_per_sec, @@ -690,6 +696,12 @@ transport_connection_snd_space (transport_connection_t * tc, u64 time_now, return snd_space; } +u64 +transport_connection_tx_pacer_rate (transport_connection_t * tc) +{ + return spacer_pace_rate (&tc->pacer); +} + void transport_connection_update_tx_stats (transport_connection_t * tc, u32 bytes) { diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 0aaaf7482d3..fadb02d6710 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -193,10 +193,25 @@ void transport_connection_tx_pacer_update (transport_connection_t * tc, u32 transport_connection_snd_space (transport_connection_t * tc, u64 time_now, u16 mss); +/** + * Get tx pacer max burst + * + * @param tc transport connection + * @param time_now current cpu time + * @return max burst for connection + */ u32 transport_connection_tx_pacer_burst (transport_connection_t * tc, u64 time_now); /** + * Get tx pacer current rate + * + * @param tc transport connection + * @return rate for connection in bytes/s + */ +u64 transport_connection_tx_pacer_rate (transport_connection_t * tc); + +/** * Initialize period for tx pacers * * Defines a unit of time with respect to number of cpu cycles that is to diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index f72b9577eba..03110e5c911 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -270,6 +270,9 @@ tcp_connection_cleanup (tcp_connection_t * tc) vec_free (tc->snd_sacks); vec_free (tc->snd_sacks_fl); + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_cleanup (tc); + /* Poison the entry */ if (CLIB_DEBUG > 0) clib_memset (tc, 0xFA, sizeof (*tc)); @@ -662,6 +665,9 @@ tcp_connection_init_vars (tcp_connection_t * tc) if (transport_connection_is_tx_paced (&tc->connection) || tcp_main.tx_pacing) tcp_enable_pacing (tc); + + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_init (tc); } static int diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index bc1e3c0f813..b0c3ecc8c15 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -128,6 +128,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(DEQ_PENDING, "Pending dequeue acked") \ _(PSH_PENDING, "PSH pending") \ _(FINRCVD, "FIN received") \ + _(RATE_SAMPLE, "Conn does rate sampling") \ + _(TRACK_BURST, "Track burst") \ typedef enum _tcp_connection_flag_bits { @@ -174,7 +176,7 @@ typedef struct _sack_scoreboard u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ u32 last_sacked_bytes; /**< Number of bytes last sacked */ - u32 last_bytes_delivered; /**< Number of sack bytes delivered */ + u32 last_bytes_delivered; /**< Sack bytes delivered to app */ u32 snd_una_adv; /**< Bytes to add to snd_una */ u32 high_sacked; /**< Highest byte sacked (fack) */ u32 high_rxt; /**< Highest retransmitted sequence */ @@ -231,6 +233,44 @@ void scoreboard_clear (sack_scoreboard_t * sb); void scoreboard_init (sack_scoreboard_t * sb); u8 *format_tcp_scoreboard (u8 * s, va_list * args); +#define TCP_BTS_INVALID_INDEX ((u32)~0) + +typedef enum tcp_bts_flags_ +{ + TCP_BTS_IS_RXT = 1, + TCP_BTS_IS_APP_LIMITED = 1 << 1, +} __clib_packed tcp_bts_flags_t; + +typedef struct tcp_bt_sample_ +{ + u32 next; /**< Next sample index in list */ + u32 prev; /**< Previous sample index in list */ + u32 min_seq; /**< Min seq number in sample */ + u32 max_seq; /**< Max seq number. Set for rxt samples */ + u64 delivered; /**< Total delivered when sample taken */ + f64 delivered_time; /**< Delivered time when sample taken */ + u64 tx_rate; /**< Tx pacing rate */ + tcp_bts_flags_t flags; /**< Sample flag */ +} tcp_bt_sample_t; + +typedef struct tcp_rate_sample_ +{ + u64 sample_delivered; /**< Delivered of sample used for rate */ + u32 delivered; /**< Bytes delivered in ack time */ + f64 ack_time; /**< Time to ack the bytes delivered */ + u64 tx_rate; /**< Tx pacing rate */ + tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */ +} tcp_rate_sample_t; + +typedef struct tcp_byte_tracker_ +{ + tcp_bt_sample_t *samples; /**< Pool of samples */ + rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */ + u32 head; /**< Head of samples linked list */ + u32 tail; /**< Tail of samples linked list */ + u32 last_ooo; /**< Cached last ooo sample */ +} tcp_byte_tracker_t; + typedef enum _tcp_cc_algorithm_type { TCP_CC_NEWRENO, @@ -304,6 +344,7 @@ typedef struct _tcp_connection u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ u32 snd_congestion; /**< snd_una_max when congestion is detected */ + u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */ @@ -316,15 +357,20 @@ typedef struct _tcp_connection f64 rtt_ts; /**< Timestamp for tracked ACK */ f64 mrtt_us; /**< High precision mrtt from tracked acks */ - u16 mss; /**< Our max seg size that includes options */ - u32 limited_transmit; /**< snd_nxt when limited transmit starts */ - u32 last_fib_check; /**< Last time we checked fib route for peer */ - u32 sw_if_index; /**< Interface for the connection */ - u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */ - u32 psh_seq; /**< Add psh header for seg that includes this */ u32 next_node_index; /**< Can be used to control next node in output */ u32 next_node_opaque; /**< Opaque to pass to next node */ + u32 limited_transmit; /**< snd_nxt when limited transmit starts */ + u32 sw_if_index; /**< Interface for the connection */ + + /* Delivery rate estimation */ + u64 delivered; /**< Total bytes delivered to peer */ + u64 app_limited; /**< Delivered when app-limited detected */ + f64 delivered_time; /**< Time last bytes were acked */ + tcp_byte_tracker_t *bt; /**< Tx byte tracker */ + + u32 last_fib_check; /**< Last time we checked fib route for peer */ + u16 mss; /**< Our max seg size that includes options */ } tcp_connection_t; /* *INDENT-OFF* */ @@ -332,8 +378,9 @@ struct _tcp_cc_algorithm { const char *name; uword (*unformat_cfg) (unformat_input_t * input); - void (*rcv_ack) (tcp_connection_t * tc); - void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack); + void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs); + void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack, + tcp_rate_sample_t *rs); void (*congestion) (tcp_connection_t * tc); void (*recovered) (tcp_connection_t * tc); void (*init) (tcp_connection_t * tc); @@ -636,6 +683,66 @@ void tcp_program_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc); void tcp_program_dupack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc); void tcp_send_acks (tcp_worker_ctx_t * wrk); +/* + * Rate estimation + */ + +/** + * Byte tracker initialize + * + * @param tc connection for which the byte tracker should be allocated and + * initialized + */ +void tcp_bt_init (tcp_connection_t * tc); +/** + * Byte tracker cleanup + * + * @param tc connection for which the byte tracker should be cleaned up + */ +void tcp_bt_cleanup (tcp_connection_t * tc); +/** + * Flush byte tracker samples + * + * @param tc tcp connection for which samples should be flushed + */ +void tcp_bt_flush_samples (tcp_connection_t * tc); +/** + * Track a tcp tx burst + * + * @param tc tcp connection + */ +void tcp_bt_track_tx (tcp_connection_t * tc); +/** + * Track a tcp retransmission + * + * @param tc tcp connection + * @param start start sequence number + * @param end end sequence number + */ +void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end); +/** + * Generate a delivery rate sample from recently acked bytes + * + * @param tc tcp connection + * @param rs resulting rate sample + */ +void tcp_bt_sample_delivery_rate (tcp_connection_t * tc, + tcp_rate_sample_t * rs); +/** + * Check if sample to be generated is app limited + * + * @param tc tcp connection + */ +void tcp_bt_check_app_limited (tcp_connection_t * tc); +/** + * Check if the byte tracker is in sane state + * + * Should be used only for testing + * + * @param bt byte tracker + */ +int tcp_bt_is_sane (tcp_byte_tracker_t * bt); + always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) { @@ -825,12 +932,19 @@ void tcp_connection_tx_pacer_reset (tcp_connection_t * tc, u32 window, u32 start_bucket); always_inline void -tcp_cc_rcv_ack (tcp_connection_t * tc) +tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs) { - tc->cc_algo->rcv_ack (tc); + tc->cc_algo->rcv_ack (tc, rs); tc->tsecr_last_ack = tc->rcv_opts.tsecr; } +static inline void +tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, + tcp_rate_sample_t * rs) +{ + tc->cc_algo->rcv_cong_ack (tc, ack_type, rs); +} + always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { @@ -959,7 +1073,8 @@ tcp_cc_data (tcp_connection_t * tc) return (void *) tc->cc_data; } -void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type); +void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, + tcp_rate_sample_t * rs); /** * Push TCP header to buffer diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c new file mode 100644 index 00000000000..74947474b53 --- /dev/null +++ b/src/vnet/tcp/tcp_bt.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2019 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * TCP byte tracker that can generate delivery rate estimates. Based on + * draft-cheng-iccrg-delivery-rate-estimation-00 + */ + +#include <vnet/tcp/tcp.h> + +static tcp_bt_sample_t * +bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index) +{ + if (pool_is_free_index (bt->samples, bts_index)) + return 0; + return pool_elt_at_index (bt->samples, bts_index); +} + +static tcp_bt_sample_t * +bt_next_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts) +{ + return bt_get_sample (bt, bts->next); +} + +static tcp_bt_sample_t * +bt_prev_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts) +{ + return bt_get_sample (bt, bts->prev); +} + +static u32 +bt_sample_index (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts) +{ + if (!bts) + return TCP_BTS_INVALID_INDEX; + return bts - bt->samples; +} + +static inline int +bt_seq_lt (u32 a, u32 b) +{ + return seq_lt (a, b); +} + +static tcp_bt_sample_t * +bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq) +{ + tcp_bt_sample_t *bts; + + pool_get_zero (bt->samples, bts); + bts->next = bts->prev = TCP_BTS_INVALID_INDEX; + bts->min_seq = min_seq; + rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples, + bt_seq_lt); + return bts; +} + +static void +bt_free_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts) +{ + if (bts->prev != TCP_BTS_INVALID_INDEX) + { + tcp_bt_sample_t *prev = bt_prev_sample (bt, bts); + prev->next = bts->next; + } + else + bt->head = bts->next; + + if (bts->next != TCP_BTS_INVALID_INDEX) + { + tcp_bt_sample_t *next = bt_next_sample (bt, bts); + next->prev = bts->prev; + } + else + bt->tail = bts->prev; + + rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt); + if (CLIB_DEBUG) + memset (bts, 0xfc, sizeof (*bts)); + pool_put (bt->samples, bts); +} + +static tcp_bt_sample_t * +bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq) +{ + rb_tree_t *rt = &bt->sample_lookup; + rb_node_t *cur, *prev; + tcp_bt_sample_t *bts; + + cur = rb_node (rt, rt->root); + if (rb_node_is_tnil (rt, cur)) + return 0; + + while (seq != cur->key) + { + prev = cur; + if (seq_lt (seq, cur->key)) + cur = rb_node_left (rt, cur); + else + cur = rb_node_right (rt, cur); + + if (rb_node_is_tnil (rt, cur)) + { + /* Hit tnil as a left child. Find predecessor */ + if (seq_lt (seq, prev->key)) + { + cur = rb_tree_predecessor (rt, prev); + if (rb_node_is_tnil (rt, cur)) + return 0; + bts = bt_get_sample (bt, cur->opaque); + } + /* Hit tnil as a right child */ + else + { + bts = bt_get_sample (bt, prev->opaque); + } + + if (seq_geq (seq, bts->min_seq)) + return bts; + + return 0; + } + } + + if (!rb_node_is_tnil (rt, cur)) + return bt_get_sample (bt, cur->opaque); + + return 0; +} + +static void +bt_update_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq) +{ + rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt); + bts->min_seq = seq; + rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, + bt_sample_index (bt, bts), bt_seq_lt); +} + +static tcp_bt_sample_t * +bt_fix_overlapped (tcp_byte_tracker_t * bt, tcp_bt_sample_t * start, + u32 seq, u8 is_end) +{ + tcp_bt_sample_t *cur, *next; + + cur = start; + while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq)) + { + bt_free_sample (bt, cur); + cur = next; + } + + if (next) + { + bt_free_sample (bt, cur); + return next; + } + + /* Overlapping current entirely */ + if (is_end) + { + bt_free_sample (bt, cur); + return 0; + } + + /* Overlapping head of current but not all */ + bt_update_sample (bt, cur, seq); + return cur; +} + +int +tcp_bt_is_sane (tcp_byte_tracker_t * bt) +{ + tcp_bt_sample_t *bts, *tmp; + + if (pool_elts (bt->samples) != pool_elts (bt->sample_lookup.nodes) - 1) + return 0; + + if (bt->head == TCP_BTS_INVALID_INDEX) + { + if (bt->tail != TCP_BTS_INVALID_INDEX) + return 0; + if (pool_elts (bt->samples) != 0) + return 0; + return 1; + } + + bts = bt_get_sample (bt, bt->tail); + if (!bts) + return 0; + + bts = bt_get_sample (bt, bt->head); + if (!bts || bts->prev != TCP_BTS_INVALID_INDEX) + return 0; + + while (bts) + { + tmp = bt_lookup_seq (bt, bts->min_seq); + if (!tmp) + return 0; + if (tmp != bts) + return 0; + tmp = bt_next_sample (bt, bts); + if (tmp) + { + if (tmp->prev != bt_sample_index (bt, bts)) + { + clib_warning ("next %u thinks prev is %u should be %u", + bts->next, tmp->prev, bt_sample_index (bt, bts)); + return 0; + } + if (!seq_lt (bts->min_seq, tmp->min_seq)) + return 0; + } + else + { + if (bt->tail != bt_sample_index (bt, bts)) + return 0; + if (bts->next != TCP_BTS_INVALID_INDEX) + return 0; + } + bts = tmp; + } + return 1; +} + +static tcp_bt_sample_t * +tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq) +{ + tcp_bt_sample_t *bts; + bts = bt_alloc_sample (tc->bt, min_seq); + bts->delivered = tc->delivered; + bts->delivered_time = tc->delivered_time; + bts->tx_rate = transport_connection_tx_pacer_rate (&tc->connection); + bts->flags |= tc->app_limited ? TCP_BTS_IS_APP_LIMITED : 0; + return bts; +} + +void +tcp_bt_check_app_limited (tcp_connection_t * tc) +{ + u32 available_bytes, flight_size; + + available_bytes = transport_max_tx_dequeue (&tc->connection); + flight_size = tcp_flight_size (tc); + + /* Not enough bytes to fill the cwnd */ + if (available_bytes + flight_size + tc->snd_mss < tc->cwnd + /* Bytes considered lost have been retransmitted */ + && tc->sack_sb.lost_bytes <= tc->snd_rxt_bytes) + tc->app_limited = tc->delivered + flight_size ? : 1; +} + +void +tcp_bt_track_tx (tcp_connection_t * tc) +{ + tcp_byte_tracker_t *bt = tc->bt; + tcp_bt_sample_t *bts, *tail; + u32 bts_index; + + if (!tcp_flight_size (tc)) + tc->delivered_time = tcp_time_now_us (tc->c_thread_index); + + bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt); + bts_index = bt_sample_index (bt, bts); + tail = bt_get_sample (bt, bt->tail); + if (tail) + { + tail->next = bts_index; + bts->prev = bt->tail; + bt->tail = bts_index; + } + else + { + bt->tail = bt->head = bts_index; + } +} + +void +tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end) +{ + tcp_byte_tracker_t *bt = tc->bt; + tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts; + u32 bts_index, cur_index, next_index, prev_index, min_seq; + u8 is_end = end == tc->snd_nxt; + + bts = bt_get_sample (bt, bt->last_ooo); + if (bts && bts->max_seq == start) + { + bts->max_seq = end; + next = bt_next_sample (bt, bts); + if (next) + bt_fix_overlapped (bt, next, end, is_end); + + return; + } + + /* Find original tx sample */ + bts = bt_lookup_seq (bt, start); + + ASSERT (bts != 0 && seq_geq (start, bts->min_seq)); + + /* Head in the past */ + if (seq_lt (bts->min_seq, tc->snd_una)) + bt_update_sample (bt, bts, tc->snd_una); + + /* Head overlap */ + if (bts->min_seq == start) + { + prev_index = bts->prev; + next = bt_fix_overlapped (bt, bts, end, is_end); + next_index = bt_sample_index (bt, next); + + cur = tcp_bt_alloc_tx_sample (tc, start); + cur->max_seq = end; + cur->flags |= TCP_BTS_IS_RXT; + cur->next = next_index; + cur->prev = prev_index; + + cur_index = bt_sample_index (bt, cur); + + if (next_index != TCP_BTS_INVALID_INDEX) + { + next = bt_get_sample (bt, next_index); + next->prev = cur_index; + } + else + { + bt->tail = cur_index; + } + + if (prev_index != TCP_BTS_INVALID_INDEX) + { + prev = bt_get_sample (bt, prev_index); + prev->next = cur_index; + } + else + { + bt->head = cur_index; + } + + bt->last_ooo = cur_index; + return; + } + + bts_index = bt_sample_index (bt, bts); + next = bt_next_sample (bt, bts); + if (next) + next = bt_fix_overlapped (bt, next, end, is_end); + + min_seq = next ? next->min_seq : tc->snd_nxt; + ASSERT (seq_lt (start, min_seq)); + + /* Have to split or tail overlap */ + cur = tcp_bt_alloc_tx_sample (tc, start); + cur->max_seq = end; + cur->flags |= TCP_BTS_IS_RXT; + cur->prev = bts_index; + cur_index = bt_sample_index (bt, cur); + + /* Split. Allocate another sample */ + if (seq_lt (end, min_seq)) + { + nbts = tcp_bt_alloc_tx_sample (tc, end); + cur = bt_get_sample (bt, cur_index); + bts = bt_get_sample (bt, bts_index); + + *nbts = *bts; + nbts->min_seq = end; + + if (nbts->next != TCP_BTS_INVALID_INDEX) + { + next = bt_get_sample (bt, nbts->next); + next->prev = bt_sample_index (bt, nbts); + } + else + bt->tail = bt_sample_index (bt, nbts); + + bts->next = nbts->prev = cur_index; + cur->next = bt_sample_index (bt, nbts); + + bt->last_ooo = cur_index; + } + /* Tail completely overlapped */ + else + { + bts = bt_get_sample (bt, bts_index); + + if (bts->next != TCP_BTS_INVALID_INDEX) + { + next = bt_get_sample (bt, bts->next); + next->prev = cur_index; + } + else + bt->tail = cur_index; + + cur->next = bts->next; + bts->next = cur_index; + + bt->last_ooo = cur_index; + } +} + +static void +tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts, + tcp_rate_sample_t * rs) +{ + if (rs->sample_delivered && rs->sample_delivered >= bts->delivered) + return; + + rs->sample_delivered = bts->delivered; + rs->delivered = tc->delivered - bts->delivered; + rs->ack_time = tc->delivered_time - bts->delivered_time; + rs->tx_rate = bts->tx_rate; + rs->flags = bts->flags; +} + +static void +tcp_bt_walk_samples (tcp_connection_t * tc, tcp_rate_sample_t * rs) +{ + tcp_byte_tracker_t *bt = tc->bt; + tcp_bt_sample_t *next, *cur; + + cur = bt_get_sample (bt, bt->head); + tcp_bt_sample_to_rate_sample (tc, cur, rs); + while ((next = bt_get_sample (bt, cur->next)) + && seq_lt (next->min_seq, tc->snd_una)) + { + bt_free_sample (bt, cur); + tcp_bt_sample_to_rate_sample (tc, next, rs); + cur = next; + } + + ASSERT (seq_lt (cur->min_seq, tc->snd_una)); + + /* All samples acked */ + if (tc->snd_una == tc->snd_nxt) + { + ASSERT (pool_elts (bt->samples) == 1); + bt_free_sample (bt, cur); + return; + } + + /* Current sample completely consumed */ + if (next && next->min_seq == tc->snd_una) + { + bt_free_sample (bt, cur); + cur = next; + } +} + +static void +tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs) +{ + sack_block_t *blks = tc->rcv_opts.sacks, *blk; + tcp_byte_tracker_t *bt = tc->bt; + tcp_bt_sample_t *next, *cur; + int i; + + for (i = 0; i < vec_len (blks); i++) + { + blk = &blks[i]; + + /* Ignore blocks that are already covered by snd_una */ + if (seq_lt (blk->end, tc->snd_una)) + continue; + + cur = bt_lookup_seq (bt, blk->start); + if (!cur) + continue; + + tcp_bt_sample_to_rate_sample (tc, cur, rs); + + /* Current shouldn't be removed */ + if (cur->min_seq != blk->start) + { + cur = bt_next_sample (bt, cur); + if (!cur) + continue; + } + + while ((next = bt_get_sample (bt, cur->next)) + && seq_lt (next->min_seq, blk->end)) + { + bt_free_sample (bt, cur); + tcp_bt_sample_to_rate_sample (tc, next, rs); + cur = next; + } + + /* Current consumed entirely */ + if (next && next->min_seq == blk->end) + bt_free_sample (bt, cur); + } +} + +void +tcp_bt_sample_delivery_rate (tcp_connection_t * tc, tcp_rate_sample_t * rs) +{ + u32 delivered; + + if (PREDICT_FALSE (tc->flags & TCP_CONN_FINSNT)) + return; + + delivered = tc->bytes_acked + tc->sack_sb.last_sacked_bytes; + if (!delivered || tc->bt->head == TCP_BTS_INVALID_INDEX) + return; + + /* Do not count bytes that were previously sacked again */ + tc->delivered += delivered - tc->sack_sb.last_bytes_delivered; + tc->delivered_time = tcp_time_now_us (tc->c_thread_index); + + if (tc->app_limited && tc->delivered > tc->app_limited) + tc->app_limited = 0; + + if (tc->bytes_acked) + tcp_bt_walk_samples (tc, rs); + + if (tc->sack_sb.last_sacked_bytes) + tcp_bt_walk_samples_ooo (tc, rs); +} + +void +tcp_bt_flush_samples (tcp_connection_t * tc) +{ + tcp_byte_tracker_t *bt = tc->bt; + tcp_bt_sample_t *bts; + u32 *samples = 0, *si; + + vec_validate (samples, pool_elts (bt->samples) - 1); + + /* *INDENT-OFF* */ + pool_foreach (bts, bt->samples, ({ + vec_add1 (samples, bts - bt->samples); + })); + /* *INDENT-ON* */ + + vec_foreach (si, samples) + { + bts = bt_get_sample (bt, *si); + bt_free_sample (bt, bts); + } + + vec_free (samples); +} + +void +tcp_bt_cleanup (tcp_connection_t * tc) +{ + tcp_byte_tracker_t *bt = tc->bt; + + rb_tree_free_nodes (&bt->sample_lookup); + pool_free (bt->samples); + clib_mem_free (bt); + tc->bt = 0; +} + +void +tcp_bt_init (tcp_connection_t * tc) +{ + tcp_byte_tracker_t *bt; + + bt = clib_mem_alloc (sizeof (tcp_byte_tracker_t)); + clib_memset (bt, 0, sizeof (tcp_byte_tracker_t)); + + rb_tree_init (&bt->sample_lookup); + bt->head = bt->tail = TCP_BTS_INVALID_INDEX; + tc->bt = bt; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_cubic.c b/src/vnet/tcp/tcp_cubic.c index 38156274ce0..80d4308bbb7 100644 --- a/src/vnet/tcp/tcp_cubic.c +++ b/src/vnet/tcp/tcp_cubic.c @@ -130,7 +130,7 @@ cubic_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes_acked) } static void -cubic_rcv_ack (tcp_connection_t * tc) +cubic_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs) { cubic_data_t *cd = (cubic_data_t *) tcp_cc_data (tc); u64 w_cubic, w_aimd; diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6c78af0fc88..944b8eb2208 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1264,12 +1264,12 @@ tcp_cc_recover (tcp_connection_t * tc) } static void -tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) +tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs) { ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc)); /* Congestion avoidance */ - tcp_cc_rcv_ack (tc); + tcp_cc_rcv_ack (tc, rs); /* If a cumulative ack, make sure dupacks is 0 */ tc->rcv_dupacks = 0; @@ -1376,7 +1376,8 @@ tcp_do_fastretransmits (tcp_worker_ctx_t * wrk) * One function to rule them all ... and in the darkness bind them */ static void -tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) +tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs, + u32 is_dack) { u32 rxt_delivered; @@ -1402,7 +1403,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) { ASSERT (tcp_in_fastrecovery (tc)); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); return; } else if (tcp_should_fastrecover (tc)) @@ -1421,7 +1422,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } tcp_cc_init_congestion (tc); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); if (tcp_opts_sack_permitted (&tc->rcv_opts)) { @@ -1447,7 +1448,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) else if (!tc->bytes_acked || (tc->bytes_acked && !tcp_in_cong_recovery (tc))) { - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs); return; } else @@ -1502,7 +1503,7 @@ partial_ack: } /* Treat as congestion avoidance ack */ - tcp_cc_rcv_ack (tc); + tcp_cc_rcv_ack (tc, rs); return; } @@ -1520,7 +1521,7 @@ partial_ack: /* Post RTO timeout don't try anything fancy */ if (tcp_in_recovery (tc)) { - tcp_cc_rcv_ack (tc); + tcp_cc_rcv_ack (tc, rs); transport_add_tx_event (&tc->connection); return; } @@ -1557,7 +1558,7 @@ partial_ack: tc->snd_rxt_bytes = 0; } - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs); /* * Since this was a partial ack, try to retransmit some more data @@ -1573,6 +1574,7 @@ tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t * th, u32 * error) { u32 prev_snd_wnd, prev_snd_una; + tcp_rate_sample_t rs = { 0 }; u8 is_dack; TCP_EVT_DBG (TCP_EVT_CC_STAT, tc); @@ -1602,7 +1604,7 @@ tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number); if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) - tcp_cc_handle_event (tc, 1); + tcp_cc_handle_event (tc, 0, 1); /* Don't drop yet */ return 0; } @@ -1630,6 +1632,9 @@ process_ack: tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number); } + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_sample_delivery_rate (tc, &rs); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); /* @@ -1638,7 +1643,7 @@ process_ack: if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) { - tcp_cc_handle_event (tc, is_dack); + tcp_cc_handle_event (tc, &rs, is_dack); if (!tcp_in_cong_recovery (tc)) { *error = TCP_ERROR_ACK_OK; @@ -1653,7 +1658,7 @@ process_ack: /* * Update congestion control (slow start/congestion avoidance) */ - tcp_cc_update (tc, b); + tcp_cc_update (tc, &rs); *error = TCP_ERROR_ACK_OK; return 0; } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c40e4432732..3887b34b7ea 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -28,7 +28,7 @@ newreno_recovered (tcp_connection_t * tc) } void -newreno_rcv_ack (tcp_connection_t * tc) +newreno_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs) { if (tcp_in_slowstart (tc)) { @@ -42,7 +42,8 @@ newreno_rcv_ack (tcp_connection_t * tc) } void -newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) +newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, + tcp_rate_sample_t * rs) { if (ack_type == TCP_CC_DUPACK) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 1adac95731f..d3c4ca4a314 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -422,6 +422,9 @@ tcp_update_burst_snd_vars (tcp_connection_t * tc) &tc->snd_opts); tcp_update_rcv_wnd (tc); + + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tc->flags |= TCP_CONN_TRACK_BURST; } void @@ -1129,8 +1132,17 @@ u32 tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b) { tcp_connection_t *tc = (tcp_connection_t *) tconn; + + if (tc->flags & TCP_CONN_TRACK_BURST) + { + tcp_bt_check_app_limited (tc); + tcp_bt_track_tx (tc); + tc->flags &= ~TCP_CONN_TRACK_BURST; + } + tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1, /* update_snd_nxt */ 1); + tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); /* If not tracking an ACK, start tracking */ @@ -1418,7 +1430,11 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, return 0; if (tcp_in_fastrecovery (tc)) - tc->snd_rxt_bytes += n_bytes; + { + tc->snd_rxt_bytes += n_bytes; + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_track_rxt (tc, start, start + n_bytes); + } done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -1540,6 +1556,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) else scoreboard_clear (&tc->sack_sb); + if (tc->flags & TCP_CONN_RATE_SAMPLE) + tcp_bt_flush_samples (tc); + /* If we've sent beyond snd_congestion, update it */ tc->snd_congestion = seq_max (tc->snd_nxt, tc->snd_congestion); |