aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Florin Coras <fcoras@cisco.com> 2017-08-01 16:56:58 -0700
committer Dave Barach <openvpp@barachs.net> 2017-08-11 16:03:19 +0000
commit b2215d6b0d8ef7d425d2b9eea524a1c055a9f3b3 (patch)
tree 6299677b83934af494e6bb7dd130ed8928304729
parent 755e41e4574103f5435ca45384c236bf11d8e28f (diff)
Fix tcp multi buffer segments retransmission
- Fix tcp/udp sw checksum computation
- Fix allocation of multi buffer tcp segments for retransmits
- Send FIN only if/when tx fifo is empty

Change-Id: I2e43a14b87a72c9e547b4339b9a51811cf5732c4
Signed-off-by: Florin Coras <fcoras@cisco.com>
-rw-r--r--src/vlib/buffer_funcs.h7
-rwxr-xr-xsrc/vnet/ip/ip4_forward.c12
-rw-r--r--src/vnet/session/session.c6
-rw-r--r--src/vnet/session/session_node.c39
-rw-r--r--src/vnet/tcp/tcp.c39
-rw-r--r--src/vnet/tcp/tcp.h18
-rw-r--r--src/vnet/tcp/tcp_input.c48
-rw-r--r--src/vnet/tcp/tcp_output.c290
8 files changed, 304 insertions, 155 deletions
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 72008dad..6a662416 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -833,7 +833,12 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst,
_(current_length);
_(flags);
#undef _
- ASSERT (dst->total_length_not_including_first_buffer == 0);
+ /* ASSERT (dst->total_length_not_including_first_buffer == 0); */
+ /* total_length_not_including_first_buffer is not in the template anymore
+ * so it may actually not be zeroed for some buffers. One option is to
+ * uncomment the line below (comes at a cost), the other is to just not
+ * care */
+ /* dst->total_length_not_including_first_buffer = 0; */
ASSERT (dst->n_add_refs == 0);
}
diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index 7a8d7a0c..496df3c7 100755
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -1454,7 +1454,7 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
{
ip_csum_t sum0;
u32 ip_header_length, payload_length_host_byte_order;
- u32 n_this_buffer, n_bytes_left;
+ u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer;
u16 sum16;
void *data_this_buffer;
@@ -1481,10 +1481,12 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
n_bytes_left = n_this_buffer = payload_length_host_byte_order;
data_this_buffer = (void *) ip0 + ip_header_length;
- if (n_this_buffer + ip_header_length > p0->current_length)
- n_this_buffer =
- p0->current_length >
- ip_header_length ? p0->current_length - ip_header_length : 0;
+ n_ip_bytes_this_buffer = p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data);
+ if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer)
+ {
+ n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ?
+ n_ip_bytes_this_buffer - ip_header_length : 0;
+ }
while (1)
{
sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index 533a6c22..3a3e4dfe 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -98,9 +98,9 @@ session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b,
u32 offset, u8 is_in_order)
{
vlib_buffer_t *chain_b;
- u32 chain_bi = b->next_buffer;
+ u32 chain_bi = b->next_buffer, len;
vlib_main_t *vm = vlib_get_main ();
- u8 *data, len;
+ u8 *data;
u16 written = 0;
int rv = 0;
@@ -226,7 +226,7 @@ u32
stream_session_tx_fifo_max_dequeue (transport_connection_t * tc)
{
stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index);
- if (s->session_state != SESSION_STATE_READY)
+ if (!s->server_tx_fifo)
return 0;
return svm_fifo_max_dequeue (s->server_tx_fifo);
}
diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c
index 8d703b0b..9c5b17d9 100644
--- a/src/vnet/session/session_node.c
+++ b/src/vnet/session/session_node.c
@@ -75,20 +75,25 @@ always_inline void
session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
u8 thread_index, svm_fifo_t * fifo,
vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg,
- u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset,
- u16 deq_per_buf, u8 peek_data)
+ u32 left_from_seg, u32 * left_to_snd0,
+ u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf,
+ u8 peek_data)
{
vlib_buffer_t *chain_b0, *prev_b0;
- u32 chain_bi0;
+ u32 chain_bi0, to_deq;
u16 len_to_deq0, n_bytes_read;
u8 *data0, j;
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ b0->total_length_not_including_first_buffer = 0;
+
chain_bi0 = bi0;
chain_b0 = b0;
+ to_deq = left_from_seg;
for (j = 1; j < n_bufs_per_seg; j++)
{
prev_b0 = chain_b0;
- len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf);
+ len_to_deq0 = clib_min (to_deq, deq_per_buf);
*n_bufs -= 1;
chain_bi0 = smm->tx_buffers[thread_index][*n_bufs];
@@ -117,10 +122,12 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm,
/* update current buffer */
chain_b0->next_buffer = 0;
- *left_to_snd0 -= n_bytes_read;
- if (*left_to_snd0 == 0)
+ to_deq -= n_bytes_read;
+ if (to_deq == 0)
break;
}
+ ASSERT (to_deq == 0);
+ *left_to_snd0 -= left_from_seg;
}
always_inline int
@@ -223,7 +230,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
&& ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE)));
n_bufs += buffers_allocated;
-
_vec_len (smm->tx_buffers[thread_index]) = n_bufs;
if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE))
@@ -289,11 +295,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
* Fill in the remaining buffers in the chain, if any
*/
if (PREDICT_FALSE (n_bufs_per_seg > 1))
- session_tx_fifo_chain_tail (smm, vm, thread_index,
- s0->server_tx_fifo, b0, bi0,
- n_bufs_per_seg, &left_to_snd0,
- &n_bufs, &rx_offset, deq_per_buf,
- peek_data);
+ {
+ u32 left_for_seg;
+ left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0);
+ session_tx_fifo_chain_tail (smm, vm, thread_index,
+ s0->server_tx_fifo, b0, bi0,
+ n_bufs_per_seg, left_for_seg,
+ &left_to_snd0, &n_bufs, &rx_offset,
+ deq_per_buf, peek_data);
+ }
/* Ask transport to push header after current_length and
* total_length_not_including_first_buffer are updated */
@@ -607,8 +617,9 @@ skip_dequeue:
clib_warning ("It's dead, Jim!");
continue;
}
-
- if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED))
+ /* Can retransmit for closed sessions, but can't do anything if the
+ * session has not yet reached the ready state */
+ if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY))
continue;
/* Spray packets in per session type frames, since they go to
* different nodes */
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 8e2eb9f4..4652618b 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -288,18 +288,31 @@ tcp_connection_close (tcp_connection_t * tc)
{
TCP_EVT_DBG (TCP_EVT_CLOSE, tc);
- /* Send FIN if needed */
- if (tc->state == TCP_STATE_ESTABLISHED
- || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT)
- tcp_send_fin (tc);
-
- /* Switch state */
- if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD)
- tc->state = TCP_STATE_FIN_WAIT_1;
- else if (tc->state == TCP_STATE_SYN_SENT)
- tc->state = TCP_STATE_CLOSED;
- else if (tc->state == TCP_STATE_CLOSE_WAIT)
- tc->state = TCP_STATE_LAST_ACK;
+ /* Send/Program FIN if needed and switch state */
+ switch (tc->state)
+ {
+ case TCP_STATE_SYN_SENT:
+ tc->state = TCP_STATE_CLOSED;
+ break;
+ case TCP_STATE_SYN_RCVD:
+ tcp_send_fin (tc);
+ tc->state = TCP_STATE_FIN_WAIT_1;
+ break;
+ case TCP_STATE_ESTABLISHED:
+ if (!stream_session_tx_fifo_max_dequeue (&tc->connection))
+ tcp_send_fin (tc);
+ else
+ tc->flags |= TCP_CONN_FINPNDG;
+ tc->state = TCP_STATE_FIN_WAIT_1;
+ break;
+ case TCP_STATE_CLOSE_WAIT:
+ tcp_send_fin (tc);
+ tc->state = TCP_STATE_LAST_ACK;
+ break;
+ default:
+ clib_warning ("shouldn't be here");
+ }
+
TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
/* If in CLOSED and WAITCLOSE timer is not set, delete connection now */
@@ -1284,6 +1297,8 @@ tcp_main_enable (vlib_main_t * vm)
vec_validate (tm->tx_frames[0], num_threads - 1);
vec_validate (tm->tx_frames[1], num_threads - 1);
+ tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
return error;
}
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 997df76f..a17262fa 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -116,7 +116,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
_(RECOVERY, "Recovery on") \
_(FAST_RECOVERY, "Fast Recovery on") \
_(FR_1_SMSS, "Sent 1 SMSS") \
- _(HALF_OPEN_DONE, "Half-open completed")
+ _(HALF_OPEN_DONE, "Half-open completed") \
+ _(FINPNDG, "FIN pending")
typedef enum _tcp_connection_flag_bits
{
@@ -404,6 +405,9 @@ typedef struct _tcp_main
/** Port allocator random number generator seed */
u32 port_allocator_seed;
+
+ /** vlib buffer size */
+ u32 bytes_per_buffer;
} tcp_main_t;
extern tcp_main_t tcp_main;
@@ -587,6 +591,14 @@ tcp_available_snd_space (const tcp_connection_t * tc)
return available_wnd - flight_size;
}
+always_inline u8
+tcp_is_lost_fin (tcp_connection_t * tc)
+{
+  /* FIN was sent and exactly one sequence number is still unacked */
+  return ((tc->flags & TCP_CONN_FINSNT)
+          && tc->snd_una_max - tc->snd_una == 1) ? 1 : 0;
+}
+
i32 tcp_rcv_wnd_available (tcp_connection_t * tc);
u32 tcp_snd_space (tcp_connection_t * tc);
void tcp_update_rcv_wnd (tcp_connection_t * tc);
@@ -621,8 +633,8 @@ tcp_update_time (f64 now, u32 thread_index)
u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b);
u32
-tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
- u32 offset, u32 max_bytes);
+tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
+ u32 max_bytes, vlib_buffer_t ** b);
void tcp_connection_timers_init (tcp_connection_t * tc);
void tcp_connection_timers_reset (tcp_connection_t * tc);
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 29f4f08d..a3b48d83 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -492,14 +492,6 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
&& (prev_snd_wnd == tc->snd_wnd));
}
-static u8
-tcp_is_lost_fin (tcp_connection_t * tc)
-{
- if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
- return 1;
- return 0;
-}
-
/**
* Checks if ack is a congestion control event.
*/
@@ -1162,7 +1154,8 @@ partial_ack:
/* Remove retransmitted bytes that have been delivered */
ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
- >= tc->sack_sb.last_bytes_delivered);
+ >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
{
@@ -1273,6 +1266,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
{
tcp_cc_handle_event (tc, is_dack);
+ if (!tcp_in_cong_recovery (tc))
+ return 0;
*error = TCP_ERROR_ACK_DUP;
TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
return vnet_buffer (b)->tcp.data_len ? 0 : -1;
@@ -1498,6 +1493,29 @@ tcp_can_delack (tcp_connection_t * tc)
}
+/**
+ * Discard bytes from the head of a (possibly chained) buffer.
+ *
+ * @param b               first buffer of the segment
+ * @param n_bytes_to_drop number of leading bytes to discard
+ * @return 0 on success, -1 if the chain holds fewer bytes than requested
+ */
static int
+tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  u32 discard;
+
+  /* Single buffer case: the caller no longer advances the buffer itself,
+   * so this helper must do it even when everything fits in b */
+  if (n_bytes_to_drop <= b->current_length)
+    {
+      vlib_buffer_advance (b, n_bytes_to_drop);
+      return 0;
+    }
+
+  /* Handle multi buffer segments */
+  if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+    return -1;
+  while (1)
+    {
+      discard = clib_min (n_bytes_to_drop, b->current_length);
+      vlib_buffer_advance (b, discard);
+      n_bytes_to_drop -= discard;
+      if (n_bytes_to_drop == 0)
+        break;
+      /* Chain ended before all bytes were dropped; do not follow a
+       * next_buffer index that is not marked as present */
+      if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+        return -1;
+      b = vlib_get_buffer (vm, b->next_buffer);
+    }
+  return 0;
+}
+
+static int
tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
u32 * next0)
{
@@ -1530,7 +1548,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
n_data_bytes -= n_bytes_to_drop;
vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
- vlib_buffer_advance (b, n_bytes_to_drop);
+ if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
+ goto done;
goto in_order;
}
@@ -2252,8 +2271,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
goto drop;
+ /* Still have to send the FIN */
+ if (tc0->flags & TCP_CONN_FINPNDG)
+ {
+ /* TX fifo finally drained */
+ if (!stream_session_tx_fifo_max_dequeue (&tc0->connection))
+ tcp_send_fin (tc0);
+ }
/* If FIN is ACKed */
- if (tc0->snd_una == tc0->snd_una_max)
+ else if (tc0->snd_una == tc0->snd_una_max)
{
ASSERT (tcp_fin (tcp0));
tc0->rcv_nxt += 1;
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index f8fbb8a9..4c1add21 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -15,6 +15,7 @@
#include <vnet/tcp/tcp.h>
#include <vnet/lisp-cp/packets.h>
+#include <math.h>
vlib_node_registration_t tcp4_output_node;
vlib_node_registration_t tcp6_output_node;
@@ -84,7 +85,7 @@ void
tcp_update_rcv_mss (tcp_connection_t * tc)
{
/* TODO find our iface MTU */
- tc->mss = dummy_mtu;
+ tc->mss = dummy_mtu - sizeof (tcp_header_t);
}
/**
@@ -437,27 +438,34 @@ tcp_init_mss (tcp_connection_t * tc)
}
+/**
+ * Grow the per-thread cache of tx buffer indices.
+ *
+ * Newly allocated indices are appended after the ones already cached.
+ *
+ * @param tm             tcp main
+ * @param thread_index   thread whose tx_buffers vector is replenished
+ * @param n_free_buffers number of buffers to try to allocate
+ * @return 0 if at least one buffer was allocated, -1 otherwise
+ */
always_inline int
+tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers)
+{
+  u32 n_cached = vec_len (tm->tx_buffers[thread_index]);
+  u32 n_alloc;
+
+  /* Make room after the cached indices; allocating at offset 0 would
+   * overwrite, and thereby leak, buffers that are still cached */
+  vec_validate (tm->tx_buffers[thread_index],
+                n_cached + n_free_buffers - 1);
+  n_alloc =
+    vlib_buffer_alloc_from_free_list (vlib_get_main (),
+                                      &tm->tx_buffers[thread_index][n_cached],
+                                      n_free_buffers,
+                                      VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+  _vec_len (tm->tx_buffers[thread_index]) = n_cached + n_alloc;
+  /* buffer shortage, report failure */
+  if (n_alloc == 0)
+    {
+      clib_warning ("out of buffers");
+      return -1;
+    }
+  return 0;
+}
+
+always_inline int
tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx)
{
- u32 *my_tx_buffers, n_free_buffers;
+ u32 *my_tx_buffers;
u32 thread_index = vlib_get_thread_index ();
- my_tx_buffers = tm->tx_buffers[thread_index];
- if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0))
+ if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0))
{
- n_free_buffers = VLIB_FRAME_SIZE;
- vec_validate (my_tx_buffers, n_free_buffers - 1);
- _vec_len (my_tx_buffers) =
- vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers,
- n_free_buffers,
- VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
- /* buffer shortage, report failure */
- if (vec_len (my_tx_buffers) == 0)
- {
- clib_warning ("out of buffers");
- return -1;
- }
- tm->tx_buffers[thread_index] = my_tx_buffers;
+ if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE))
+ return -1;
}
+ my_tx_buffers = tm->tx_buffers[thread_index];
*bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
_vec_len (my_tx_buffers) -= 1;
return 0;
@@ -476,6 +484,7 @@ always_inline void
tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
{
vlib_buffer_t *it = b;
+ u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK;
do
{
it->current_data = 0;
@@ -485,6 +494,10 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b)
while ((it->flags & VLIB_BUFFER_NEXT_PRESENT)
&& (it = vlib_get_buffer (vm, it->next_buffer)));
+ if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ vlib_buffer_free_one (vm, b->next_buffer);
+ b->flags = save_free_list;
+
/* Leave enough space for headers */
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
vnet_buffer (b)->tcp.flags = 0;
@@ -959,18 +972,16 @@ tcp_send_fin (tcp_connection_t * tc)
return;
b = vlib_get_buffer (vm, bi);
- /* Leave enough space for headers */
- vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
-
tcp_make_fin (tc, b);
tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
tc->flags |= TCP_CONN_FINSNT;
+ tc->flags &= ~TCP_CONN_FINPNDG;
tcp_retransmit_timer_force_update (tc);
TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
}
always_inline u8
-tcp_make_state_flags (tcp_state_t next_state)
+tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state)
{
switch (next_state)
{
@@ -982,7 +993,10 @@ tcp_make_state_flags (tcp_state_t next_state)
return TCP_FLAG_SYN;
case TCP_STATE_LAST_ACK:
case TCP_STATE_FIN_WAIT_1:
- return TCP_FLAG_FIN;
+ if (tc->snd_nxt + 1 < tc->snd_una_max)
+ return TCP_FLAG_ACK;
+ else
+ return TCP_FLAG_FIN;
default:
clib_warning ("Shouldn't be here!");
}
@@ -1008,7 +1022,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
advertise_wnd = tcp_window_to_advertise (tc, next_state);
- flags = tcp_make_state_flags (next_state);
+ flags = tcp_make_state_flags (tc, next_state);
/* Push header and options */
th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
@@ -1055,7 +1069,11 @@ tcp_send_ack (tcp_connection_t * tc)
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
}
-/* Send delayed ACK when timer expires */
+/**
+ * Delayed ack timer handler
+ *
+ * Sends delayed ACK when timer expires
+ */
void
tcp_timer_delack_handler (u32 index)
{
@@ -1067,49 +1085,138 @@ tcp_timer_delack_handler (u32 index)
tcp_send_ack (tc);
}
-/** Build a retransmit segment
+/**
+ * Build a retransmit segment
*
* @return the number of bytes in the segment or 0 if there's nothing to
* retransmit
*/
u32
-tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
- u32 offset, u32 max_bytes)
+tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset,
+ u32 max_deq_bytes, vlib_buffer_t ** b)
{
+ tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
int n_bytes = 0;
- u32 start;
-
- tcp_reuse_buffer (vm, b);
+ u32 start, bi, available_bytes;
ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
- ASSERT (max_bytes != 0);
+ ASSERT (max_deq_bytes != 0);
- max_bytes = clib_min (tc->snd_mss, max_bytes);
+ /*
+ * Make sure we can retransmit something
+ */
+ max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+ if (!available_bytes)
+ return 0;
+ max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
start = tc->snd_una + offset;
/* Start is beyond snd_congestion */
if (seq_geq (start, tc->snd_congestion))
- goto done;
+ {
+ goto done;
+ }
/* Don't overshoot snd_congestion */
- if (seq_gt (start + max_bytes, tc->snd_congestion))
+ if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
{
- max_bytes = tc->snd_congestion - start;
- if (max_bytes == 0)
- goto done;
+ max_deq_bytes = tc->snd_congestion - start;
+ if (max_deq_bytes == 0)
+ {
+ goto done;
+ }
}
+ /*
+ * Prepare options
+ */
tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
- ASSERT (max_bytes <= tc->snd_mss);
+ /*
+ * Allocate and fill in buffer(s)
+ */
+
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return 0;
+ *b = vlib_get_buffer (vm, bi);
+
+ /* Easy case, buffer size greater than mss */
+ if (PREDICT_TRUE (max_deq_bytes <= tm->bytes_per_buffer))
+ {
+ n_bytes = stream_session_peek_bytes (&tc->connection,
+ vlib_buffer_get_current (*b),
+ offset, max_deq_bytes);
+ ASSERT (n_bytes == max_deq_bytes);
+ b[0]->current_length = n_bytes;
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ }
+ /* Split mss into multiple buffers */
+ else
+ {
+ u32 chain_bi = ~0, n_bufs_per_seg;
+ u32 thread_index = vlib_get_thread_index ();
+ u16 n_peeked, len_to_deq, available_bufs;
+ vlib_buffer_t *chain_b, *prev_b;
+ u8 *data0;
+ int i;
+
+ n_bufs_per_seg = ceil ((double) max_deq_bytes / tm->bytes_per_buffer);
+ ASSERT (available_bytes >= max_deq_bytes);
+
+ /* Make sure we have enough buffers */
+ available_bufs = vec_len (tm->tx_buffers[thread_index]);
+ if (n_bufs_per_seg > available_bufs)
+ {
+ if (tcp_alloc_tx_buffers (tm, thread_index,
+ VLIB_FRAME_SIZE - available_bufs))
+ {
+ tcp_return_buffer (tm);
+ return 0;
+ }
+ }
+
+ n_bytes = stream_session_peek_bytes (&tc->connection,
+ vlib_buffer_get_current (*b),
+ offset, tm->bytes_per_buffer);
+ b[0]->current_length = n_bytes;
+ b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ b[0]->total_length_not_including_first_buffer = 0;
+
+ tcp_push_hdr_i (tc, *b, tc->state, 0);
+ max_deq_bytes -= n_bytes;
+
+ chain_b = *b;
+ for (i = 1; i < n_bufs_per_seg; i++)
+ {
+ prev_b = chain_b;
+ len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer);
+ tcp_get_free_buffer_index (tm, &chain_bi);
+ ASSERT (chain_bi != (u32) ~ 0);
+ chain_b = vlib_get_buffer (vm, chain_bi);
+ chain_b->current_data = 0;
+ data0 = vlib_buffer_get_current (chain_b);
+ n_peeked = stream_session_peek_bytes (&tc->connection, data0,
+ n_bytes, len_to_deq);
+ n_bytes += n_peeked;
+ ASSERT (n_peeked == len_to_deq);
+ chain_b->current_length = n_peeked;
+ b[0]->total_length_not_including_first_buffer +=
+ chain_b->current_length;
+
+ /* update previous buffer */
+ prev_b->next_buffer = chain_bi;
+ prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+ /* update current buffer */
+ chain_b->next_buffer = 0;
+
+ max_deq_bytes -= n_peeked;
+ }
+ }
- n_bytes = stream_session_peek_bytes (&tc->connection,
- vlib_buffer_get_current (b), offset,
- max_bytes);
ASSERT (n_bytes > 0);
- b->current_length = n_bytes;
- tcp_push_hdr_i (tc, b, tc->state, 0);
if (tcp_in_fastrecovery (tc))
tc->snd_rxt_bytes += n_bytes;
@@ -1147,7 +1254,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
vlib_main_t *vm = vlib_get_main ();
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
- vlib_buffer_t *b;
+ vlib_buffer_t *b = 0;
u32 bi, n_bytes;
if (is_syn)
@@ -1174,17 +1281,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
/* Go back to first un-acked byte */
tc->snd_nxt = tc->snd_una;
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
-
if (tc->state >= TCP_STATE_ESTABLISHED)
{
/* Lost FIN, retransmit and return */
- if (tc->flags & TCP_CONN_FINSNT)
+ if (tcp_is_lost_fin (tc))
{
- tcp_return_buffer (tm);
tcp_send_fin (tc);
return;
}
@@ -1199,7 +1300,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
/* Send one segment */
- n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+ ASSERT (n_bytes);
+ bi = vlib_get_buffer_index (vm, b);
/* TODO be less aggressive about this */
scoreboard_clear (&tc->sack_sb);
@@ -1212,7 +1315,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tcp_retransmit_timer_set (tc);
ASSERT (0 || (tc->rto_boff > 1
&& tc->snd_una == tc->snd_congestion));
- tcp_return_buffer (tm);
return;
}
@@ -1234,7 +1336,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
clib_warning ("could not remove half-open connection");
ASSERT (0);
}
- tcp_return_buffer (tm);
return;
}
@@ -1243,6 +1344,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+ return;
+ b = vlib_get_buffer (vm, bi);
vlib_buffer_make_headroom (b, MAX_HDRS_LEN);
tcp_push_hdr_i (tc, b, tc->state, 1);
@@ -1256,7 +1360,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
{
ASSERT (tc->state == TCP_STATE_CLOSED);
clib_warning ("connection closed ...");
- tcp_return_buffer (tm);
return;
}
@@ -1305,7 +1408,7 @@ tcp_timer_persist_handler (u32 index)
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
vlib_buffer_t *b;
- u32 bi, old_snd_nxt;
+ u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0;
int n_bytes = 0;
tc = tcp_connection_get_if_valid (index, thread_index);
@@ -1317,34 +1420,31 @@ tcp_timer_persist_handler (u32 index)
tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
/* Problem already solved or worse */
+ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED
- || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+ || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)
+ || !available_bytes)
return;
/* Increment RTO backoff */
tc->rto_boff += 1;
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- /* Try to force the first unsent segment */
+ /*
+ * Try to force the first unsent segment (or buffer)
+ */
if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
return;
-
b = vlib_get_buffer (vm, bi);
tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
+ snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer);
n_bytes = stream_session_peek_bytes (&tc->connection,
vlib_buffer_get_current (b),
tc->snd_una_max - tc->snd_una,
- tc->snd_mss);
- /* Nothing to send */
- if (n_bytes <= 0)
- {
- // clib_warning ("persist found nothing to send");
- tcp_return_buffer (tm);
- return;
- }
-
+ snd_bytes);
+ ASSERT (n_bytes != 0);
b->current_length = n_bytes;
ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1
|| tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT));
@@ -1365,32 +1465,20 @@ tcp_timer_persist_handler (u32 index)
void
tcp_retransmit_first_unacked (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
vlib_buffer_t *b;
- u32 bi, n_bytes, old_snd_nxt;
+ u32 bi, old_snd_nxt, n_bytes;
old_snd_nxt = tc->snd_nxt;
tc->snd_nxt = tc->snd_una;
- /* Get buffer */
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
-
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
-
- n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
- if (n_bytes == 0)
- {
- tcp_return_buffer (tm);
- goto done;
- }
-
+ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
+ /* Nothing to retransmit: snd_nxt was rewound above and must still be restored */
+ if (!n_bytes)
+ goto done;
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
done:
tc->snd_nxt = old_snd_nxt;
}
@@ -1400,10 +1488,9 @@ done:
void
tcp_fast_retransmit_sack (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset = 0, max_bytes;
- vlib_buffer_t *b;
+ vlib_buffer_t *b = 0;
sack_scoreboard_hole_t *hole;
sack_scoreboard_t *sb;
u32 bi, old_snd_nxt;
@@ -1420,10 +1507,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
while (hole && snd_space > 0)
{
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
-
- b = vlib_get_buffer (vm, bi);
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
&can_rescue, &snd_limited);
@@ -1443,7 +1526,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
offset = tc->snd_congestion - tc->snd_una - max_bytes;
sb->rescue_rxt = tc->snd_congestion;
tc->snd_nxt = tc->snd_una + offset;
- tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes,
+ &b);
+ ASSERT (n_written);
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
break;
}
@@ -1451,15 +1537,13 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt;
offset = sb->high_rxt - tc->snd_una;
tc->snd_nxt = tc->snd_una + offset;
- n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b);
/* Nothing left to retransmit */
if (n_written == 0)
- {
- tcp_return_buffer (tm);
- break;
- }
+ break;
+ bi = vlib_get_buffer_index (vm, b);
sb->high_rxt += n_written;
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
snd_space -= n_written;
@@ -1475,7 +1559,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc)
void
tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
{
- tcp_main_t *tm = vnet_get_tcp_main ();
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset = 0, bi, old_snd_nxt;
int snd_space;
@@ -1491,19 +1574,14 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
while (snd_space > 0)
{
- if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
- return;
- b = vlib_get_buffer (vm, bi);
offset += n_written;
- n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space);
+ n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
/* Nothing left to retransmit */
if (n_written == 0)
- {
- tcp_return_buffer (tm);
- break;
- }
+ break;
+ bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
snd_space -= n_written;
}