author     Florin Coras <fcoras@cisco.com>           2019-10-12 18:10:20 -0700
committer  Andrew Yourtchenko <ayourtch@gmail.com>   2019-10-31 12:16:19 +0000
commit     cd0c5d8373d04f9e49b2ad630632e7940d841986 (patch)
tree       eb853c3f13605bbeaf7478f27aa25532e2073927 /src/vnet
parent     dcc5de6fc321c3828a7f2d55204641e198da5af2 (diff)
tcp: improve rate samples for retransmitted segments
Type: fix
- Initialize max_seq on both transmitted and retransmitted segments
- Keep track of segments that have been sacked.
- Track new data segments sent during recovery
Change-Id: Ice55231a3da200ae6171702e54b2ce155f831143
Signed-off-by: Florin Coras <fcoras@cisco.com>
(cherry picked from commit d6ae4bf13a7819d64d128d196d491af4700fa948)
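To illustrate the first two points above with something runnable, here is a minimal, self-contained sketch of the idea, not the VPP implementation: a transmitted range is tracked as a sample carrying both min_seq and max_seq, split when a SACK block covers only part of it, and merged back once adjacent ranges share the same state. The names range_t, range_split and range_merge are hypothetical; the real code in this patch (bt_split_sample/bt_merge_sample in tcp_bt.c below) stores samples in a pool indexed by a red-black tree and compares sequence numbers with seq_lt/seq_leq to handle wraparound.

/* Hypothetical sketch only: a transmitted range tracked as [min_seq, max_seq)
 * with a SACKed flag, split when a SACK block covers part of it and merged
 * back once adjacent ranges share the same state. Unlike the real tracker,
 * it ignores sequence wraparound and uses plain structs instead of a pool. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
  uint32_t min_seq;   /* first sequence number covered by the sample */
  uint32_t max_seq;   /* one past the last covered sequence number */
  int is_sacked;      /* set once the receiver has SACKed these bytes */
} range_t;

/* Split [min_seq, max_seq) at 'seq'; 'right' receives [seq, max_seq) */
static void
range_split (range_t *r, uint32_t seq, range_t *right)
{
  assert (r->min_seq < seq && seq < r->max_seq);
  right->min_seq = seq;
  right->max_seq = r->max_seq;
  right->is_sacked = r->is_sacked;
  r->max_seq = seq;
}

/* Merge an adjacent range with the same SACKed state into 'left' */
static void
range_merge (range_t *left, const range_t *right)
{
  assert (left->max_seq == right->min_seq);
  assert (left->is_sacked == right->is_sacked);
  left->max_seq = right->max_seq;
}

int
main (void)
{
  range_t tx = { 1000, 2000, 0 }, sacked;

  /* SACK block [1400, 2000) arrives: split and mark the right part */
  range_split (&tx, 1400, &sacked);
  sacked.is_sacked = 1;
  printf ("unsacked [%u, %u)  sacked [%u, %u)\n",
          (unsigned) tx.min_seq, (unsigned) tx.max_seq,
          (unsigned) sacked.min_seq, (unsigned) sacked.max_seq);

  /* Once the left part is SACKed as well, the two ranges merge again */
  tx.is_sacked = 1;
  range_merge (&tx, &sacked);
  printf ("merged   [%u, %u)\n", (unsigned) tx.min_seq, (unsigned) tx.max_seq);
  return 0;
}

The merge step is what keeps the sample list from growing by one entry per SACK block once consecutive ranges end up in the same state.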
Diffstat (limited to 'src/vnet')
-rw-r--r--   src/vnet/tcp/tcp.h          5
-rw-r--r--   src/vnet/tcp/tcp_bt.c     229
-rw-r--r--   src/vnet/tcp/tcp_output.c  27
3 files changed, 184 insertions, 77 deletions
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 097a1475185..8636a7c85db 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -140,7 +140,6 @@ typedef enum tcp_cfg_flag_
   _(DEQ_PENDING, "Dequeue pending ") \
   _(PSH_PENDING, "PSH pending") \
   _(FINRCVD, "FIN received") \
-  _(TRACK_BURST, "Track burst") \
   _(ZERO_RWND_SENT, "Zero RWND sent") \

 typedef enum tcp_connection_flag_bits_
@@ -256,6 +255,7 @@ typedef enum tcp_bts_flags_
 {
   TCP_BTS_IS_RXT = 1,
   TCP_BTS_IS_APP_LIMITED = 1 << 1,
+  TCP_BTS_IS_SACKED = 1 << 2,
 } __clib_packed tcp_bts_flags_t;

 typedef struct tcp_bt_sample_
@@ -824,7 +824,7 @@ void tcp_bt_flush_samples (tcp_connection_t * tc);
  *
  * @param tc tcp connection
  */
-void tcp_bt_track_tx (tcp_connection_t * tc);
+void tcp_bt_track_tx (tcp_connection_t * tc, u32 len);
 /**
  * Track a tcp retransmission
  *
@@ -855,6 +855,7 @@ void tcp_bt_check_app_limited (tcp_connection_t * tc);
  *
  * @param bt byte tracker
  */
 int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
+u8 *format_tcp_bt (u8 * s, va_list * args);

 always_inline u32
 tcp_end_seq (tcp_header_t * th, u32 len)
diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c
index b550fdd65dd..b3f4e6ab000 100644
--- a/src/vnet/tcp/tcp_bt.c
+++ b/src/vnet/tcp/tcp_bt.c
@@ -53,13 +53,14 @@ bt_seq_lt (u32 a, u32 b)
 }

 static tcp_bt_sample_t *
-bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq)
+bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq, u32 max_seq)
 {
   tcp_bt_sample_t *bts;

   pool_get_zero (bt->samples, bts);
   bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
   bts->min_seq = min_seq;
+  bts->max_seq = max_seq;
   rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
                       bt_seq_lt);
   return bts;
@@ -91,6 +92,47 @@ bt_free_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
 }

 static tcp_bt_sample_t *
+bt_split_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
+{
+  tcp_bt_sample_t *ns, *next;
+  u32 bts_index;
+
+  bts_index = bt_sample_index (bt, bts);
+
+  ASSERT (seq_leq (bts->min_seq, seq) && seq_lt (seq, bts->max_seq));
+
+  ns = bt_alloc_sample (bt, seq, bts->max_seq);
+  bts = bt_get_sample (bt, bts_index);
+
+  *ns = *bts;
+  ns->min_seq = seq;
+  bts->max_seq = seq;
+
+  next = bt_next_sample (bt, bts);
+  if (next)
+    next->prev = bt_sample_index (bt, ns);
+  else
+    bt->tail = bt_sample_index (bt, ns);
+
+  bts->next = bt_sample_index (bt, ns);
+  ns->prev = bt_sample_index (bt, bts);
+
+  return ns;
+}
+
+static tcp_bt_sample_t *
+bt_merge_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * prev,
+                 tcp_bt_sample_t * cur)
+{
+  ASSERT (prev->max_seq == cur->min_seq);
+  prev->max_seq = cur->max_seq;
+  if (bt_sample_index (bt, cur) == bt->tail)
+    bt->tail = bt_sample_index (bt, prev);
+  bt_free_sample (bt, cur);
+  return prev;
+}
+
+static tcp_bt_sample_t *
 bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
 {
   rb_tree_t *rt = &bt->sample_lookup;
@@ -154,27 +196,16 @@ bt_fix_overlapped (tcp_byte_tracker_t * bt, tcp_bt_sample_t * start,
   tcp_bt_sample_t *cur, *next;

   cur = start;
-  while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq))
+  while (cur && seq_leq (cur->max_seq, seq))
     {
+      next = bt_next_sample (bt, cur);
       bt_free_sample (bt, cur);
       cur = next;
     }

-  if (next)
-    {
-      bt_free_sample (bt, cur);
-      return next;
-    }
-
-  /* Overlapping current entirely */
-  if (is_end)
-    {
-      bt_free_sample (bt, cur);
-      return 0;
-    }
+  if (cur && seq_lt (cur->min_seq, seq))
+    bt_update_sample (bt, cur, seq);

-  /* Overlapping head of current but not all */
-  bt_update_sample (bt, cur, seq);
   return cur;
 }

@@ -235,10 +266,10 @@ tcp_bt_is_sane (tcp_byte_tracker_t * bt)
 }

 static tcp_bt_sample_t *
-tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq)
+tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq, u32 max_seq)
 {
   tcp_bt_sample_t *bts;
-  bts = bt_alloc_sample (tc->bt, min_seq);
+  bts = bt_alloc_sample (tc->bt, min_seq, max_seq);
   bts->delivered = tc->delivered;
   bts->delivered_time = tc->delivered_time;
   bts->tx_time = tcp_time_now_us (tc->c_thread_index);
@@ -263,19 +294,27 @@ tcp_bt_check_app_limited (tcp_connection_t * tc)
 }

 void
-tcp_bt_track_tx (tcp_connection_t * tc)
+tcp_bt_track_tx (tcp_connection_t * tc, u32 len)
 {
   tcp_byte_tracker_t *bt = tc->bt;
   tcp_bt_sample_t *bts, *tail;
   u32 bts_index;

+  tail = bt_get_sample (bt, bt->tail);
+  if (tail && tail->max_seq == tc->snd_nxt
+      && tail->tx_time == tcp_time_now_us (tc->c_thread_index))
+    {
+      tail->max_seq += len;
+      return;
+    }
+
   if (tc->snd_una == tc->snd_nxt)
     {
       tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
       tc->first_tx_time = tc->delivered_time;
     }

-  bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt);
+  bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt, tc->snd_nxt + len);
   bts_index = bt_sample_index (bt, bts);
   tail = bt_get_sample (bt, bt->tail);
   if (tail)
@@ -295,11 +334,13 @@ tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
 {
   tcp_byte_tracker_t *bt = tc->bt;
   tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
-  u32 bts_index, cur_index, next_index, prev_index, min_seq;
+  u32 bts_index, cur_index, next_index, prev_index, max_seq;
   u8 is_end = end == tc->snd_nxt;

+  /* Contiguous blocks retransmitted at the same time */
   bts = bt_get_sample (bt, bt->last_ooo);
-  if (bts && bts->max_seq == start)
+  if (bts && bts->max_seq == start
+      && bts->tx_time == tcp_time_now_us (tc->c_thread_index))
     {
       bts->max_seq = end;
       next = bt_next_sample (bt, bts);
@@ -325,8 +366,7 @@ tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
       next = bt_fix_overlapped (bt, bts, end, is_end);
       next_index = bt_sample_index (bt, next);

-      cur = tcp_bt_alloc_tx_sample (tc, start);
-      cur->max_seq = end;
+      cur = tcp_bt_alloc_tx_sample (tc, start, end);
       cur->flags |= TCP_BTS_IS_RXT;
       cur->next = next_index;
       cur->prev = prev_index;
@@ -362,20 +402,19 @@ tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
   if (next)
     next = bt_fix_overlapped (bt, next, end, is_end);

-  min_seq = next ? next->min_seq : tc->snd_nxt;
-  ASSERT (seq_lt (start, min_seq));
+  max_seq = bts->max_seq;
+  ASSERT (seq_lt (start, max_seq));

   /* Have to split or tail overlap */
-  cur = tcp_bt_alloc_tx_sample (tc, start);
-  cur->max_seq = end;
+  cur = tcp_bt_alloc_tx_sample (tc, start, end);
   cur->flags |= TCP_BTS_IS_RXT;
   cur->prev = bts_index;

   cur_index = bt_sample_index (bt, cur);

   /* Split. Allocate another sample */
-  if (seq_lt (end, min_seq))
+  if (seq_lt (end, max_seq))
     {
-      nbts = tcp_bt_alloc_tx_sample (tc, end);
+      nbts = tcp_bt_alloc_tx_sample (tc, end, bts->max_seq);
       cur = bt_get_sample (bt, cur_index);
       bts = bt_get_sample (bt, bts_index);
@@ -393,12 +432,14 @@ tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
       bts->next = nbts->prev = cur_index;
       cur->next = bt_sample_index (bt, nbts);

+      bts->max_seq = start;
       bt->last_ooo = cur_index;
     }
   /* Tail completely overlapped */
   else
     {
       bts = bt_get_sample (bt, bts_index);
+      bts->max_seq = start;

       if (bts->next != TCP_BTS_INVALID_INDEX)
         {
@@ -419,13 +460,16 @@ static void
 tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
                               tcp_rate_sample_t * rs)
 {
+  if (bts->flags & TCP_BTS_IS_SACKED)
+    return;
+
   if (rs->prior_delivered && rs->prior_delivered >= bts->delivered)
     return;

   rs->prior_delivered = bts->delivered;
   rs->prior_time = bts->delivered_time;
   rs->interval_time = bts->tx_time - bts->first_tx_time;
-  rs->rtt_time = bts->tx_time;
+  rs->rtt_time = tc->delivered_time - bts->tx_time;
   rs->flags = bts->flags;
   tc->first_tx_time = bts->tx_time;
 }
@@ -437,31 +481,16 @@ tcp_bt_walk_samples (tcp_connection_t * tc, tcp_rate_sample_t * rs)
   tcp_bt_sample_t *next, *cur;

   cur = bt_get_sample (bt, bt->head);
-  tcp_bt_sample_to_rate_sample (tc, cur, rs);
-  while ((next = bt_get_sample (bt, cur->next))
-         && seq_lt (next->min_seq, tc->snd_una))
+  while (cur && seq_leq (cur->max_seq, tc->snd_una))
     {
+      next = bt_next_sample (bt, cur);
+      tcp_bt_sample_to_rate_sample (tc, cur, rs);
       bt_free_sample (bt, cur);
-      tcp_bt_sample_to_rate_sample (tc, next, rs);
       cur = next;
     }

-  ASSERT (seq_lt (cur->min_seq, tc->snd_una));
-
-  /* All samples acked */
-  if (tc->snd_una == tc->snd_nxt)
-    {
-      ASSERT (pool_elts (bt->samples) == 1);
-      bt_free_sample (bt, cur);
-      return;
-    }
-
-  /* Current sample completely consumed */
-  if (next && next->min_seq == tc->snd_una)
-    {
-      bt_free_sample (bt, cur);
-      cur = next;
-    }
+  if (cur && seq_lt (cur->min_seq, tc->snd_una))
+    tcp_bt_sample_to_rate_sample (tc, cur, rs);
 }

 static void
@@ -469,7 +498,7 @@ tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs)
 {
   sack_block_t *blks = tc->rcv_opts.sacks, *blk;
   tcp_byte_tracker_t *bt = tc->bt;
-  tcp_bt_sample_t *next, *cur;
+  tcp_bt_sample_t *cur, *prev, *next;
   int i;

   for (i = 0; i < vec_len (blks); i++)
@@ -484,27 +513,64 @@ tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs)
       if (!cur)
         continue;

-      tcp_bt_sample_to_rate_sample (tc, cur, rs);
+      ASSERT (seq_geq (blk->start, cur->min_seq)
+              && seq_lt (blk->start, cur->max_seq));

-      /* Current shouldn't be removed */
-      if (cur->min_seq != blk->start)
+      /* Current should be split. Second part will be consumed */
+      if (PREDICT_FALSE (cur->min_seq != blk->start))
         {
-          cur = bt_next_sample (bt, cur);
-          if (!cur)
-            continue;
+          cur = bt_split_sample (bt, cur, blk->start);
+          prev = bt_prev_sample (bt, cur);
         }
+      else
+        prev = bt_prev_sample (bt, cur);

-      while ((next = bt_get_sample (bt, cur->next))
-             && seq_lt (next->min_seq, blk->end))
+      while (cur && seq_leq (cur->max_seq, blk->end))
         {
-          bt_free_sample (bt, cur);
-          tcp_bt_sample_to_rate_sample (tc, next, rs);
+          if (!(cur->flags & TCP_BTS_IS_SACKED))
+            {
+              tcp_bt_sample_to_rate_sample (tc, cur, rs);
+              cur->flags |= TCP_BTS_IS_SACKED;
+              if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+                {
+                  cur = bt_merge_sample (bt, prev, cur);
+                  next = bt_next_sample (bt, cur);
+                }
+              else
+                {
+                  next = bt_next_sample (bt, cur);
+                  if (next && (next->flags & TCP_BTS_IS_SACKED))
+                    {
+                      cur = bt_merge_sample (bt, cur, next);
+                      next = bt_next_sample (bt, cur);
+                    }
+                }
+            }
+          else
+            next = bt_next_sample (bt, cur);
+
+          prev = cur;
           cur = next;
         }

-      /* Current consumed entirely */
-      if (next && next->min_seq == blk->end)
-        bt_free_sample (bt, cur);
+      if (cur && seq_lt (cur->min_seq, blk->end))
+        {
+          tcp_bt_sample_to_rate_sample (tc, cur, rs);
+          prev = bt_prev_sample (bt, cur);
+          /* Extend previous to include the newly sacked bytes */
+          if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+            {
+              prev->max_seq = blk->end;
+              bt_update_sample (bt, cur, blk->end);
+            }
+          /* Split sample into two. First part is consumed */
+          else
+            {
+              next = bt_split_sample (bt, cur, blk->end);
+              cur = bt_prev_sample (bt, next);
+              cur->flags |= TCP_BTS_IS_SACKED;
+            }
+        }
     }
 }

@@ -533,10 +599,9 @@ tcp_bt_sample_delivery_rate (tcp_connection_t * tc, tcp_rate_sample_t * rs)
   if (tc->sack_sb.last_sacked_bytes)
     tcp_bt_walk_samples_ooo (tc, rs);

-  rs->interval_time = clib_max (tc->delivered_time - rs->prior_time,
+  rs->interval_time = clib_max ((tc->delivered_time - rs->prior_time),
                                 rs->interval_time);
   rs->delivered = tc->delivered - rs->prior_delivered;
-  rs->rtt_time = tc->delivered_time - rs->rtt_time;
   rs->acked_and_sacked = delivered;
   rs->lost = tc->sack_sb.last_lost_bytes;
 }
@@ -590,6 +655,36 @@ tcp_bt_init (tcp_connection_t * tc)
   tc->bt = bt;
 }

+u8 *
+format_tcp_bt_sample (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  tcp_bt_sample_t *bts = va_arg (*args, tcp_bt_sample_t *);
+  f64 now = tcp_time_now_us (tc->c_thread_index);
+  s = format (s, "[%u, %u] d %u dt %.3f txt %.3f ftxt %.3f flags 0x%x",
+              bts->min_seq - tc->iss, bts->max_seq - tc->iss, bts->delivered,
+              now - bts->delivered_time, now - bts->tx_time,
+              now - bts->first_tx_time, bts->flags);
+  return s;
+}
+
+u8 *
+format_tcp_bt (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  tcp_byte_tracker_t *bt = tc->bt;
+  tcp_bt_sample_t *bts;
+
+  bts = bt_get_sample (bt, bt->head);
+  while (bts)
+    {
+      s = format (s, "%U\n", format_tcp_bt_sample, tc, bts);
+      bts = bt_next_sample (bt, bts);
+    }
+
+  return s;
+}
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 7362319646a..33bfe100a10 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -406,7 +406,7 @@ tcp_update_burst_snd_vars (tcp_connection_t * tc)
   tcp_update_rcv_wnd (tc);

   if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
-    tc->flags |= TCP_CONN_TRACK_BURST;
+    tcp_bt_check_app_limited (tc);

   if (tc->snd_una == tc->snd_nxt)
     {
@@ -1171,17 +1171,22 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, u32 snd_nxt,
   TCP_EVT (TCP_EVT_PKTIZE, tc);
 }

+always_inline u32
+tcp_buffer_len (vlib_buffer_t * b)
+{
+  u32 data_len = b->current_length;
+  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+    data_len += b->total_length_not_including_first_buffer;
+  return data_len;
+}
+
 u32
 tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
 {
   tcp_connection_t *tc = (tcp_connection_t *) tconn;

-  if (tc->flags & TCP_CONN_TRACK_BURST)
-    {
-      tcp_bt_check_app_limited (tc);
-      tcp_bt_track_tx (tc);
-      tc->flags &= ~TCP_CONN_TRACK_BURST;
-    }
+  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+    tcp_bt_track_tx (tc, tcp_buffer_len (b));

   tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
                   /* update_snd_nxt */ 1);
@@ -1783,7 +1788,7 @@ tcp_timer_persist_handler (u32 index)
   if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
     {
       tcp_bt_check_app_limited (tc);
-      tcp_bt_track_tx (tc);
+      tcp_bt_track_tx (tc, n_bytes);
     }

   tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0,
@@ -1830,6 +1835,9 @@ tcp_transmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
   available_wnd = tc->snd_wnd - offset;
   burst_size = clib_min (burst_size, available_wnd / tc->snd_mss);

+  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+    tcp_bt_check_app_limited (tc);
+
   while (n_segs < burst_size)
     {
       n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
@@ -1841,6 +1849,9 @@ tcp_transmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
       offset += n_written;
       n_segs += 1;

+      if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+        tcp_bt_track_tx (tc, n_written);
+
       tc->snd_nxt += n_written;
       tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
     }