about summary refs log tree commit diff stats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
author Florin Coras <fcoras@cisco.com> 2018-05-23 21:01:30 -0700
committer Damjan Marion <dmarion.lists@gmail.com> 2018-05-26 18:56:43 +0000
commit ca1c8f3e782dc68a51aa2792771d9b4aac696ddd (patch)
tree 890c7250af97dd65357363242e2c7272a199feca /src/vnet/tcp
parent a34c08c8c5a505e55178a9a8ef5391224d5460a5 (diff)
tcp: loss recovery improvements/fixes
- fix newreno cwnd computation
- reset snd_una_max on entering recovery
- accept acks beyond snd_nxt but less than snd_congestion when in recovery
- avoid entering fast recovery multiple times when using sacks
- avoid as much as possible sending small segments when doing fast retransmit
- more event logging

Change-Id: I19dd151d7704e39d4eae06de3a26f5e124875366
Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r-- src/vnet/tcp/tcp.c 6
-rw-r--r-- src/vnet/tcp/tcp.h 6
-rwxr-xr-x src/vnet/tcp/tcp_debug.h 274
-rw-r--r-- src/vnet/tcp/tcp_input.c 69
-rw-r--r-- src/vnet/tcp/tcp_newreno.c 2
-rw-r--r-- src/vnet/tcp/tcp_output.c 26
6 files changed, 241 insertions, 142 deletions
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 25292d1e588..15ac7d37edc 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -734,9 +734,9 @@ format_tcp_vars (u8 * s, va_list * args)
s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n",
tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
tc->snd_wl2 - tc->iss);
- s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
+ s = format (s, " flight size %u out space %u cc space %u rcv_wnd_av %u\n",
tcp_flight_size (tc), tcp_available_output_snd_space (tc),
- tcp_rcv_wnd_available (tc));
+ tcp_available_cc_snd_space (tc), tcp_rcv_wnd_available (tc));
s = format (s, " cong %U ", format_tcp_congestion_status, tc);
s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
@@ -1022,7 +1022,7 @@ tcp_snd_space (tcp_connection_t * tc)
* bytes of previously unsent data. */
if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc))
{
- if (tcp_available_output_snd_space (tc) < tc->snd_mss)
+ if (tcp_available_cc_snd_space (tc) < tc->snd_mss)
return 0;
tcp_fastrecovery_1_smss_on (tc);
return tc->snd_mss;
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 837b5b4d0d2..10aa721a4eb 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -119,7 +119,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
_(FAST_RECOVERY, "Fast Recovery") \
_(FR_1_SMSS, "Sent 1 SMSS") \
_(HALF_OPEN_DONE, "Half-open completed") \
- _(FINPNDG, "FIN pending")
+ _(FINPNDG, "FIN pending") \
typedef enum _tcp_connection_flag_bits
{
@@ -617,7 +617,7 @@ tcp_available_output_snd_space (const tcp_connection_t * tc)
* Estimate of how many bytes we can still push into the network
*/
always_inline u32
-tcp_available_snd_space (const tcp_connection_t * tc)
+tcp_available_cc_snd_space (const tcp_connection_t * tc)
{
u32 available_wnd = tcp_available_snd_wnd (tc);
u32 flight_size = tcp_flight_size (tc);
@@ -652,6 +652,7 @@ fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc);
/* Made public for unit testing only */
void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
+u32 tcp_sack_list_bytes (tcp_connection_t * tc);
always_inline u32
tcp_time_now (void)
@@ -791,7 +792,6 @@ tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
void
scoreboard_remove_hole (sack_scoreboard_t * sb,
sack_scoreboard_hole_t * hole);
-void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb);
sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb,
u32 prev_index, u32 start,
u32 end);
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 4af4f2e7052..a52efe00720 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -56,6 +56,9 @@
_(CC_PACK, "cc partial ack") \
_(CC_STAT, "cc stats") \
_(CC_RTO_STAT, "cc rto stats") \
+ _(CC_SCOREBOARD, "scoreboard stats") \
+ _(CC_SACKS, "snd sacks stats") \
+ _(CC_INPUT, "ooo data delivered") \
_(SEG_INVALID, "invalid segment") \
_(PAWS_FAIL, "failed paws check") \
_(ACK_RCV_ERR, "invalid ack") \
@@ -192,7 +195,7 @@ typedef enum _tcp_dbg_evt
ed->data[0] = _tc->c_c_index; \
}
-#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \
{ \
if (_init) \
TCP_EVT_INIT_HANDLER(_tc, 0); \
@@ -277,9 +280,9 @@ typedef enum _tcp_dbg_evt
}; \
DECLARE_ETD(_tc, _e, 4); \
ed->data[0] = _tc->iss; \
- ed->data[1] = _tc->snd_una - _tc->iss; \
+ ed->data[1] = _tc->snd_una - _tc->iss; \
ed->data[2] = _tc->snd_una_max - _tc->iss; \
- ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
}
@@ -288,14 +291,14 @@ typedef enum _tcp_dbg_evt
ELOG_TYPE_DECLARE (_e) = \
{ \
.format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
- .format_args = "i4i4i4i4i4", \
+ .format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->iss; \
ed->data[1] = _tc->irs; \
- ed->data[2] = _tc->snd_una - _tc->iss; \
- ed->data[3] = _tc->snd_nxt - _tc->iss; \
- ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
}
#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) \
@@ -303,14 +306,14 @@ typedef enum _tcp_dbg_evt
ELOG_TYPE_DECLARE (_e) = \
{ \
.format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
- .format_args = "i4i4i4i4i4", \
+ .format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->iss; \
ed->data[1] = _tc->irs; \
- ed->data[2] = _tc->snd_una - _tc->iss; \
- ed->data[3] = _tc->snd_nxt - _tc->iss; \
- ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
TCP_EVT_STATE_CHANGE_HANDLER(_tc); \
}
@@ -371,7 +374,7 @@ if (_tc) \
ELOG_TYPE_DECLARE (_e) = \
{ \
.format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \
- .format_args = "t4i4i4i4i4", \
+ .format_args = "t4i4i4i4i4", \
.n_enum_strings = 2, \
.enum_strings = { \
"syn", \
@@ -382,10 +385,9 @@ if (_tc) \
ed->data[0] = _type; \
ed->data[1] = _tc->iss; \
ed->data[2] = _tc->irs; \
- ed->data[3] = _tc->snd_nxt - _tc->iss; \
- ed->data[4] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->rcv_nxt - _tc->irs; \
}
-
#else
#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)
#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...)
@@ -399,6 +401,81 @@ if (_tc) \
#endif
#if TCP_DEBUG_SM > 1
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _btcp.seq_number - _tc->irs; \
+ ed->data[1] = _btcp.seq_end - _tc->irs; \
+ ed->data[2] = _tc->rcv_las - _tc->irs; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_wnd; \
+}
+
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _seq - _tc->irs; \
+ ed->data[1] = _end - _tc->irs; \
+ ed->data[2] = _tc->rcv_opts.tsval; \
+ ed->data[3] = _tc->tsval_recent; \
+}
+
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \
+ .format_args = "t4i4i4i4i4", \
+ .n_enum_strings = 3, \
+ .enum_strings = { \
+ "invalid", \
+ "old", \
+ "future", \
+ }, \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _type; \
+ ed->data[1] = _ack - _tc->iss; \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->snd_una_max - _tc->iss; \
+}
+
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
+{ \
+if (_av > 0) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_wnd; \
+ ed->data[1] = _obs; \
+ ed->data[2] = _av; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_las - _tc->irs; \
+} \
+}
+#else
+#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...)
+#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
+#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
+#endif
+
+#if TCP_DEBUG_SM > 2
#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \
{ \
@@ -505,90 +582,18 @@ if (_tc) \
_tc_index); \
} \
}
-
-#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \
-{ \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
- .format_args = "i4i4i4i4i4", \
- }; \
- DECLARE_ETD(_tc, _e, 5); \
- ed->data[0] = _seq - _tc->irs; \
- ed->data[1] = _end - _tc->irs; \
- ed->data[2] = _tc->rcv_las - _tc->irs; \
- ed->data[3] = _tc->rcv_nxt - _tc->irs; \
- ed->data[4] = _tc->rcv_wnd; \
-}
-
-#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \
-{ \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \
- .format_args = "i4i4i4i4", \
- }; \
- DECLARE_ETD(_tc, _e, 4); \
- ed->data[0] = _seq - _tc->irs; \
- ed->data[1] = _end - _tc->irs; \
- ed->data[2] = _tc->rcv_opts.tsval; \
- ed->data[3] = _tc->tsval_recent; \
-}
-
-#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \
-{ \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \
- .format_args = "t4i4i4i4i4", \
- .n_enum_strings = 3, \
- .enum_strings = { \
- "invalid", \
- "old", \
- "future", \
- }, \
- }; \
- DECLARE_ETD(_tc, _e, 5); \
- ed->data[0] = _type; \
- ed->data[1] = _ack - _tc->iss; \
- ed->data[2] = _tc->snd_una - _tc->iss; \
- ed->data[3] = _tc->snd_nxt - _tc->iss; \
- ed->data[4] = _tc->snd_una_max - _tc->iss; \
-}
-
-#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
-{ \
-if (_av > 0) \
-{ \
- ELOG_TYPE_DECLARE (_e) = \
- { \
- .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
- .format_args = "i4i4i4i4i4", \
- }; \
- DECLARE_ETD(_tc, _e, 5); \
- ed->data[0] = _tc->rcv_wnd; \
- ed->data[1] = _obs; \
- ed->data[2] = _av; \
- ed->data[3] = _tc->rcv_nxt - _tc->irs; \
- ed->data[4] = _tc->rcv_las - _tc->irs; \
-} \
-}
#else
#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...)
#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)
#define TCP_EVT_PKTIZE_HANDLER(_tc, ...)
#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...)
#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...)
-#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...)
-#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...)
-#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...)
-#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)
#endif
/*
* State machine verbose
*/
-#if TCP_DEBUG_SM > 2
+#if TCP_DEBUG_SM > 3
#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
@@ -626,9 +631,9 @@ if (_av > 0) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "cc: %s snd_space %u snd_cong %u rxt_bytes %u", \
- .format_args = "t4i4i4i4", \
- .n_enum_strings = 6, \
+ .format = "cc: %s snd_space %u snd_una %u out %u flight %u", \
+ .format_args = "t4i4i4i4i4", \
+ .n_enum_strings = 7, \
.enum_strings = { \
"fast-rxt", \
"rxt-timeout", \
@@ -636,13 +641,15 @@ if (_av > 0) \
"recovered", \
"congestion", \
"undo", \
+ "recovery", \
}, \
}; \
- DECLARE_ETD(_tc, _e, 4); \
+ DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _sub_evt; \
- ed->data[1] = tcp_available_snd_space (_tc); \
- ed->data[2] = _tc->snd_congestion - _tc->iss; \
- ed->data[3] = _tc->snd_rxt_bytes; \
+ ed->data[1] = tcp_available_cc_snd_space (_tc); \
+ ed->data[2] = _tc->snd_una - _tc->iss; \
+ ed->data[3] = tcp_bytes_out(_tc); \
+ ed->data[4] = tcp_flight_size (_tc); \
}
#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \
@@ -659,19 +666,19 @@ if (_av > 0) \
ed->data[3] = _tc->snd_rxt_bytes; \
}
-#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+ .format = "dack-tx: rcv_nxt %u seq %u rcv_wnd %u snd_nxt %u av_wnd %u",\
.format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->rcv_nxt - _tc->irs; \
- ed->data[1] = _tc->rcv_wnd; \
- ed->data[2] = _tc->snd_nxt - _tc->iss; \
- ed->data[3] = tcp_available_snd_wnd(_tc); \
- ed->data[4] = _tc->snd_wnd; \
+ ed->data[1] = _btcp.seq_number - _tc->irs; \
+ ed->data[2] = _tc->rcv_wnd; \
+ ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = tcp_available_snd_wnd(_tc); \
}
#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \
@@ -700,12 +707,75 @@ if (_av > 0) \
ed->data[0] = _tc->snd_una - _tc->iss; \
ed->data[1] = _tc->snd_una_max - _tc->iss; \
}
+#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...) \
+{ \
+if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes) \
+ { \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "sb1: holes %u lost %u sacked %u high %u highrxt %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = pool_elts(_tc->sack_sb.holes); \
+ ed->data[1] = _tc->sack_sb.lost_bytes; \
+ ed->data[2] = _tc->sack_sb.sacked_bytes; \
+ ed->data[3] = _tc->sack_sb.high_sacked - _tc->iss; \
+ ed->data[4] = _tc->sack_sb.high_rxt - _tc->iss; \
+ } \
+if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes) \
+ { \
+ sack_scoreboard_hole_t *hole; \
+ hole = scoreboard_first_hole (&_tc->sack_sb); \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "sb2: first start: %u end %u last start %u end %u", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = hole ? hole->start - _tc->iss : 0; \
+ ed->data[1] = hole ? hole->end - _tc->iss : 0; \
+ hole = scoreboard_last_hole (&_tc->sack_sb); \
+ ed->data[2] = hole ? hole->start - _tc->iss : 0; \
+ ed->data[3] = hole ? hole->end - _tc->iss : 0; \
+ } \
+}
+#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...) \
+{ \
+if (TCP_DEBUG_CC > 1) \
+ { \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "sacks: blocks %u bytes %u", \
+ .format_args = "i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 2); \
+ ed->data[0] = vec_len (_tc->snd_sacks); \
+ ed->data[1] = tcp_sack_list_bytes (_tc); \
+ } \
+}
+#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "cc input: len %u written %d rcv_nxt %u rcv_wnd(o) %d", \
+ .format_args = "i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 4); \
+ ed->data[0] = _len; \
+ ed->data[1] = _written; \
+ ed->data[2] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[3] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \
+}
#else
#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...)
-#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)
+#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...)
#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...)
#define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...)
+#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...)
+#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...)
#endif
/*
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index c86432037fd..19ecc7deef8 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -275,6 +275,14 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
vlib_buffer_t * b0, tcp_header_t * th0,
u32 * next0, u32 * error0)
{
+ /* We could get a burst of RSTs interleaved with acks */
+ if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
+ {
+ tcp_send_reset (tc0);
+ *error0 = TCP_ERROR_CONNECTION_CLOSED;
+ goto drop;
+ }
+
if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
{
*error0 = TCP_ERROR_SEGMENT_INVALID;
@@ -292,13 +300,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
{
*error0 = TCP_ERROR_PAWS;
if (CLIB_DEBUG > 2)
- {
- clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
- clib_warning ("seq %u seq_end %u ack %u",
- vnet_buffer (b0)->tcp.seq_number - tc0->irs,
- vnet_buffer (b0)->tcp.seq_end - tc0->irs,
- vnet_buffer (b0)->tcp.ack_number - tc0->iss);
- }
+ clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
vnet_buffer (b0)->tcp.seq_end);
@@ -317,7 +319,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
if (!tcp_rst (th0))
{
tcp_make_ack (tc0, b0);
- TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
goto error;
}
}
@@ -329,7 +331,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
vnet_buffer (b0)->tcp.seq_end))
{
*error0 = TCP_ERROR_RCV_WND;
-
/* If our window is 0 and the packet is in sequence, let it pass
* through for ack processing. It should be dropped later. */
if (!(tc0->rcv_wnd == 0
@@ -339,7 +340,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
if (!tcp_rst (th0))
{
tcp_make_ack (tc0, b0);
- TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
goto error;
}
goto drop;
@@ -889,13 +890,14 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
scoreboard_update_bytes (tc, sb);
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
- ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes);
+ ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
ASSERT (sb->sacked_bytes == 0
|| sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
- seq_max (tc->snd_una, ack));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->holes[sb->head].start == ack + sb->snd_una_adv);
+ TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
}
/**
@@ -1063,11 +1065,18 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
u32 rxt_delivered;
+ if (tcp_in_fastrecovery (tc) && tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ if (tc->bytes_acked)
+ goto partial_ack;
+ tcp_fast_retransmit (tc);
+ return;
+ }
/*
* Duplicate ACK. Check if we should enter fast recovery, or if already in
* it account for the bytes that left the network.
*/
- if (is_dack && !tcp_in_recovery (tc))
+ else if (is_dack && !tcp_in_recovery (tc))
{
TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
ASSERT (tc->snd_una != tc->snd_una_max
@@ -1128,7 +1137,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
{
tcp_fast_retransmit_no_sack (tc);
}
-
return;
}
else if (!tc->bytes_acked
@@ -1237,6 +1245,16 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
/* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
{
+ /* When we entered recovery, we reset snd_nxt to snd_una. Seems peer
+ * still has the data so accept the ack */
+ if (tcp_in_recovery (tc)
+ && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_congestion)
+ && seq_geq (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
+ {
+ tc->snd_una_max = tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+ goto process_ack;
+ }
+
/* If we have outstanding data and this is within the window, accept it,
* probably retransmit has timed out. Otherwise ACK segment and then
* drop it */
@@ -1264,9 +1282,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
vnet_buffer (b)->tcp.ack_number);
if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
- {
- tcp_cc_handle_event (tc, 1);
- }
+ tcp_cc_handle_event (tc, 1);
/* Don't drop yet */
return 0;
}
@@ -1274,7 +1290,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
/*
* Looks okay, process feedback
*/
-
+process_ack:
if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
@@ -1390,6 +1406,15 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
}
+u32
+tcp_sack_list_bytes (tcp_connection_t * tc)
+{
+ u32 bytes = 0, i;
+ for (i = 0; i < vec_len (tc->snd_sacks); i++)
+ bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;
+ return bytes;
+}
+
/** Enqueue data for delivery to application */
always_inline int
tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
@@ -1416,6 +1441,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
/* Send ACK confirming the update */
tc->flags |= TCP_CONN_SNDACK;
+ TCP_EVT_DBG (TCP_EVT_CC_INPUT, tc, data_len, written);
}
else if (written > 0)
{
@@ -1488,6 +1514,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
end = start + ooo_segment_length (s0->server_rx_fifo, newest);
tcp_update_sack_list (tc, start, end);
svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo);
+ TCP_EVT_DBG (TCP_EVT_CC_SACKS, tc);
}
}
@@ -1508,7 +1535,7 @@ tcp_can_delack (tcp_connection_t * tc)
/* constrained to send ack */
|| (tc->flags & TCP_CONN_SNDACK) != 0
/* we're almost out of tx wnd */
- || tcp_available_snd_space (tc) < 4 * tc->snd_mss)
+ || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
return 0;
return 1;
@@ -1592,7 +1619,7 @@ tcp_segment_rcv (tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0)
*next0 = tcp_next_output (tc->c_is_ip4);
tcp_make_ack (tc, b);
vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
- TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
goto done;
}
@@ -1773,9 +1800,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
&error0)))
{
tcp_maybe_inc_err_counter (err_counters, error0);
- TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
- vnet_buffer (b0)->tcp.seq_number,
- vnet_buffer (b0)->tcp.seq_end);
+ TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
goto done;
}
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index 0f43d21dfde..a9ec58c262f 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -41,8 +41,8 @@ newreno_rcv_ack (tcp_connection_t * tc)
if (tc->cwnd_acc_bytes >= tc->cwnd)
{
u32 inc = tc->cwnd_acc_bytes / tc->cwnd;
- tc->cwnd += inc * tc->snd_mss;
tc->cwnd_acc_bytes -= inc * tc->cwnd;
+ tc->cwnd += inc * tc->snd_mss;
}
tc->cwnd = clib_min (tc->cwnd,
transport_tx_fifo_size (&tc->connection));
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 27450654f71..a036287a51c 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -389,6 +389,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
{
case TCP_STATE_ESTABLISHED:
case TCP_STATE_FIN_WAIT_1:
+ case TCP_STATE_CLOSED:
return tcp_make_established_options (tc, opts);
case TCP_STATE_SYN_RCVD:
return tcp_make_synack_options (tc, opts);
@@ -1337,8 +1338,9 @@ done:
* Reset congestion control, switch cwnd to loss window and try again.
*/
static void
-tcp_rtx_timeout_cc (tcp_connection_t * tc)
+tcp_rxt_timeout_cc (tcp_connection_t * tc)
{
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6);
tc->prev_ssthresh = tc->ssthresh;
tc->prev_cwnd = tc->cwnd;
@@ -1383,6 +1385,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
}
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
+
if (tc->state >= TCP_STATE_ESTABLISHED)
{
/* Lost FIN, retransmit and return */
@@ -1414,13 +1418,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* First retransmit timeout */
if (tc->rto_boff == 1)
- tcp_rtx_timeout_cc (tc);
+ tcp_rxt_timeout_cc (tc);
tc->snd_una_max = tc->snd_nxt = tc->snd_una;
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
-
/* Send one segment. Note that n_bytes may be zero due to buffer shortfall */
n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
@@ -1627,7 +1629,7 @@ void
tcp_fast_retransmit_sack (tcp_connection_t * tc)
{
vlib_main_t *vm = vlib_get_main ();
- u32 n_written = 0, offset, max_bytes;
+ u32 n_written = 0, offset, max_bytes, n_segs = 0;
vlib_buffer_t *b = 0;
sack_scoreboard_hole_t *hole;
sack_scoreboard_t *sb;
@@ -1636,14 +1638,17 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
u8 snd_limited = 0, can_rescue = 0;
ASSERT (tcp_in_fastrecovery (tc));
- TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
old_snd_nxt = tc->snd_nxt;
sb = &tc->sack_sb;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
+ if (snd_space < tc->snd_mss)
+ goto done;
+
+ TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
- while (hole && snd_space > 0)
+ while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
{
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
@@ -1717,7 +1722,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
/* Start resending from first un-acked segment */
old_snd_nxt = tc->snd_nxt;
tc->snd_nxt = tc->snd_una;
- snd_space = tcp_available_snd_space (tc);
+ snd_space = tcp_available_cc_snd_space (tc);
while (snd_space > 0)
{
@@ -1743,8 +1748,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
void
tcp_fast_retransmit (tcp_connection_t * tc)
{
- if (tcp_opts_sack_permitted (&tc->rcv_opts)
- && scoreboard_first_hole (&tc->sack_sb))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
tcp_fast_retransmit_sack (tc);
else
tcp_fast_retransmit_no_sack (tc);