From 68b0fb0c620c7451ef1a6380c43c39de6614db51 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 28 Feb 2017 15:15:56 -0500 Subject: VPP-598: tcp stack initial commit Change-Id: I49e5ce0aae6e4ff634024387ceaf7dbc432a0351 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_packet.h | 184 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 src/vnet/tcp/tcp_packet.h (limited to 'src/vnet/tcp/tcp_packet.h') diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h new file mode 100644 index 00000000..866c5fd6 --- /dev/null +++ b/src/vnet/tcp/tcp_packet.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_tcp_packet_h +#define included_tcp_packet_h + +#include + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) /**< No more data from sender. */ \ + _ (SYN) /**< Synchronize sequence numbers. */ \ + _ (RST) /**< Reset the connection. */ \ + _ (PSH) /**< Push function. */ \ + _ (ACK) /**< Ack field significant. */ \ + _ (URG) /**< Urgent pointer field significant. */ \ + _ (ECE) /**< ECN-echo. Receiver got CE packet */ \ + _ (CWR) /**< Sender reduced congestion window */ + +enum +{ +#define _(f) TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ + TCP_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ +}; + +typedef struct _tcp_header +{ + union + { + struct + { + u16 src_port; /**< Source port. */ + u16 dst_port; /**< Destination port. */ + }; + struct + { + u16 src, dst; + }; + }; + + u32 seq_number; /**< Sequence number of the first data octet in this + * segment, except when SYN is present. If SYN + * is present the seq number is is the ISN and the + * first data octet is ISN+1 */ + u32 ack_number; /**< Acknowledgement number if ACK is set. It contains + * the value of the next sequence number the sender + * of the segment is expecting to receive. */ + u8 data_offset_and_reserved; + u8 flags; /**< Flags: see the macro above */ + u16 window; /**< Number of bytes sender is willing to receive. */ + + u16 checksum; /**< Checksum of TCP pseudo header and data. */ + u16 urgent_pointer; /**< Seq number of the byte after the urgent data. */ +} __attribute__ ((packed)) tcp_header_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4) +#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN) +#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN) +#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST) +#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH) +#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK) +#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG) +#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE) +#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR) + +/* Flag tests that return 0 or 1 */ +#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN) +#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN) + +always_inline int +tcp_header_bytes (tcp_header_t * t) +{ + return tcp_doff (t) * sizeof (u32); +} + +/* + * TCP options. + */ + +typedef enum tcp_option_type +{ + TCP_OPTION_EOL = 0, /**< End of options. */ + TCP_OPTION_NOOP = 1, /**< No operation. */ + TCP_OPTION_MSS = 2, /**< Limit MSS. */ + TCP_OPTION_WINDOW_SCALE = 3, /**< Window scale. */ + TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */ + TCP_OPTION_SACK_BLOCK = 5, /**< Selective Ack block. */ + TCP_OPTION_TIMESTAMP = 8, /**< Timestamps. */ + TCP_OPTION_UTO = 28, /**< User timeout. */ + TCP_OPTION_AO = 29, /**< Authentication Option. */ +} tcp_option_type_t; + +#define foreach_tcp_options_flag \ + _ (MSS) /**< MSS advertised in SYN */ \ + _ (TSTAMP) /**< Timestamp capability advertised in SYN */ \ + _ (WSCALE) /**< Wnd scale capability advertised in SYN */ \ + _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */ \ + _ (SACK) /**< SACK present */ + +enum +{ +#define _(f) TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ + TCP_OPTIONS_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ +}; + +typedef struct _sack_block +{ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ +} sack_block_t; + +typedef struct +{ + u8 flags; /** Option flags, see above */ + + /* Received options */ + u16 mss; /**< Maximum segment size advertised by peer */ + u8 wscale; /**< Window scale advertised by peer */ + u32 tsval; /**< Peer's timestamp value */ + u32 tsecr; /**< Echoed/reflected time stamp */ + sack_block_t *sacks; /**< SACK blocks received */ + u8 n_sack_blocks; /**< Number of SACKs blocks */ +} tcp_options_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS) +#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP) +#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE) +#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK) +#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED) + +/* TCP option lengths */ +#define TCP_OPTION_LEN_EOL 1 +#define TCP_OPTION_LEN_NOOP 1 +#define TCP_OPTION_LEN_MSS 4 +#define TCP_OPTION_LEN_WINDOW_SCALE 3 +#define TCP_OPTION_LEN_SACK_PERMITTED 2 +#define TCP_OPTION_LEN_TIMESTAMP 10 +#define TCP_OPTION_LEN_SACK_BLOCK 8 + +#define TCP_WND_MAX 65535U +#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ +#define TCP_OPTS_ALIGN 4 +#define TCP_OPTS_MAX_SACK_BLOCKS 3 +#endif /* included_tcp_packet_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 6792ec059696a358b6c98d8d86e9740b34c01e24 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 13 Mar 2017 03:49:51 -0700 Subject: TCP/session improvements - Added svm fifo flag for tracking fifo dequeue events (replaces event length). Updated all code to switch to the new scheme. - More session debugging - Fix peek index wrap - Add a trivial socket test client - Fast retransmit/cc fixes - tx and rx SACK fixes and unit testing - SRTT computation fix - remove dupack/ack burst filters - improve ack rx - improved segment rx - builtin client test code Change-Id: Ic4eb2d5ca446eb2260ccd3ccbcdaa73c64e7f4e1 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 35 +-- src/svm/svm_fifo.h | 28 ++- src/svm/svm_fifo_segment.h | 4 +- src/uri.am | 5 +- src/uri/uri_socket_test.c | 126 ++++++++++ src/uri/uri_tcp_test.c | 161 +++++++++---- src/uri/uri_udp_test.c | 13 +- src/vnet.am | 2 + src/vnet/session/application.h | 3 +- src/vnet/session/node.c | 127 ++++++---- src/vnet/session/session.c | 63 +++-- src/vnet/session/session.h | 19 +- src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_debug.h | 38 ++- src/vnet/session/transport.h | 2 +- src/vnet/tcp/builtin_client.c | 411 +++++++++++++++++++++++++++++++ src/vnet/tcp/builtin_client.h | 131 ++++++++++ src/vnet/tcp/builtin_server.c | 91 +++++-- src/vnet/tcp/tcp.c | 37 ++- src/vnet/tcp/tcp.h | 111 +++++++-- src/vnet/tcp/tcp_debug.h | 252 ++++++++++++++++--- src/vnet/tcp/tcp_error.def | 7 +- src/vnet/tcp/tcp_input.c | 507 +++++++++++++++++++++++++-------------- src/vnet/tcp/tcp_output.c | 295 ++++++++++++++++------- src/vnet/tcp/tcp_packet.h | 2 +- src/vnet/tcp/tcp_test.c | 216 +++++++++++++++++ src/vnet/udp/builtin_server.c | 29 ++- src/vnet/udp/udp_input.c | 47 ++-- 28 files changed, 2201 insertions(+), 563 deletions(-) create mode 100644 src/uri/uri_socket_test.c create mode 100644 src/vnet/tcp/builtin_client.c create mode 100644 src/vnet/tcp/builtin_client.h create mode 100644 src/vnet/tcp/tcp_test.c (limited to 'src/vnet/tcp/tcp_packet.h') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index e3f534b1..07b0d2df 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "svm_fifo.h" +#include /** create an svm fifo, in the current heap. Fails vs blow up the process */ svm_fifo_t * @@ -362,18 +362,19 @@ svm_fifo_enqueue_nowait (svm_fifo_t * f, return svm_fifo_enqueue_internal (f, pid, max_bytes, copy_from_here); } -/** Enqueue a future segment. +/** + * Enqueue a future segment. + * * Two choices: either copies the entire segment, or copies nothing * Returns 0 of the entire segment was copied * Returns -1 if none of the segment was copied due to lack of space */ - static int -svm_fifo_enqueue_with_offset_internal2 (svm_fifo_t * f, - int pid, - u32 offset, - u32 required_bytes, - u8 * copy_from_here) +svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, + u8 * copy_from_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -424,14 +425,14 @@ svm_fifo_enqueue_with_offset (svm_fifo_t * f, u32 offset, u32 required_bytes, u8 * copy_from_here) { - return svm_fifo_enqueue_with_offset_internal2 + return svm_fifo_enqueue_with_offset_internal (f, pid, offset, required_bytes, copy_from_here); } static int -svm_fifo_dequeue_internal2 (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_here) +svm_fifo_dequeue_internal (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -484,7 +485,7 @@ int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, u8 * copy_here) { - return svm_fifo_dequeue_internal2 (f, pid, max_bytes, copy_here); + return svm_fifo_dequeue_internal (f, pid, max_bytes, copy_here); } int @@ -492,7 +493,7 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; - u32 cursize, nitems; + u32 cursize, nitems, real_head; if (PREDICT_FALSE (f->cursize == 0)) return -2; /* nothing in the fifo */ @@ -500,6 +501,8 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, /* read cursize, which can only increase while we're working */ cursize = f->cursize; nitems = f->nitems; + real_head = f->head + offset; + real_head = real_head >= nitems ? real_head - nitems : real_head; /* Number of bytes we're going to copy */ total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; @@ -508,9 +511,9 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, { /* Number of bytes in first copy segment */ first_copy_bytes = - ((nitems - f->head + offset) < total_copy_bytes) ? - (nitems - f->head + offset) : total_copy_bytes; - clib_memcpy (copy_here, &f->data[f->head + offset], first_copy_bytes); + ((nitems - real_head) < total_copy_bytes) ? + (nitems - real_head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[real_head], first_copy_bytes); /* Number of bytes in second copy segment, if any */ second_copy_bytes = total_copy_bytes - first_copy_bytes; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 70624b74..39556173 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -46,9 +46,11 @@ typedef struct { pthread_mutex_t mutex; /* 8 bytes */ pthread_cond_t condvar; /* 8 bytes */ - u32 owner_pid; svm_lock_tag_t tag; - volatile u32 cursize; + + volatile u32 cursize; /**< current fifo size */ + volatile u8 has_event; /**< non-zero if deq event exists */ + u32 owner_pid; u32 nitems; /* Backpointers */ @@ -112,6 +114,28 @@ svm_fifo_has_ooo_data (svm_fifo_t * f) return f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX; } +/** + * Sets fifo event flag. + * + * @return 1 if flag was not set. + */ +always_inline u8 +svm_fifo_set_event (svm_fifo_t * f) +{ + /* Probably doesn't need to be atomic. Still, better avoid surprises */ + return __sync_lock_test_and_set (&f->has_event, 1) == 0; +} + +/** + * Unsets fifo event flag. + */ +always_inline void +svm_fifo_unset_event (svm_fifo_t * f) +{ + /* Probably doesn't need to be atomic. Still, better avoid surprises */ + __sync_lock_test_and_set (&f->has_event, 0); +} + svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 793fa7c8..ecb5653a 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -15,8 +15,8 @@ #ifndef __included_ssvm_fifo_segment_h__ #define __included_ssvm_fifo_segment_h__ -#include "svm_fifo.h" -#include "ssvm.h" +#include +#include typedef struct { diff --git a/src/uri.am b/src/uri.am index 09b5b15b..ad4d65d8 100644 --- a/src/uri.am +++ b/src/uri.am @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -noinst_PROGRAMS += uri_udp_test uri_tcp_test +noinst_PROGRAMS += uri_udp_test uri_tcp_test uri_socket_test uri_udp_test_SOURCES = uri/uri_udp_test.c uri_udp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ @@ -20,3 +20,6 @@ uri_udp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ uri_tcp_test_SOURCES = uri/uri_tcp_test.c uri_tcp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ libvppinfra.la -lpthread -lm -lrt + +uri_socket_test_SOURCES = uri/uri_socket_test.c +uri_socket_test_LDADD = libvppinfra.la -lpthread -lm -lrt diff --git a/src/uri/uri_socket_test.c b/src/uri/uri_socket_test.c new file mode 100644 index 00000000..9f049bda --- /dev/null +++ b/src/uri/uri_socket_test.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +int +main (int argc, char *argv[]) +{ + int sockfd, portno, n; + struct sockaddr_in serv_addr; + struct hostent *server; + u8 *rx_buffer = 0, *tx_buffer = 0; + u32 offset; + int iter, i; + if (0 && argc < 3) + { + fformat (stderr, "usage %s hostname port\n", argv[0]); + exit (0); + } + + portno = 1234; // atoi(argv[2]); + sockfd = socket (AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + clib_unix_error ("socket"); + exit (1); + } + server = gethostbyname ("6.0.1.1" /* argv[1] */ ); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + bzero ((char *) &serv_addr, sizeof (serv_addr)); + serv_addr.sin_family = AF_INET; + bcopy ((char *) server->h_addr, + (char *) &serv_addr.sin_addr.s_addr, server->h_length); + serv_addr.sin_port = htons (portno); + if (connect (sockfd, (const void *) &serv_addr, sizeof (serv_addr)) < 0) + { + clib_unix_warning ("connect"); + exit (1); + } + + vec_validate (rx_buffer, 1400); + vec_validate (tx_buffer, 1400); + + for (i = 0; i < vec_len (tx_buffer); i++) + tx_buffer[i] = (i + 1) % 0xff; + + /* + * Send one packet to warm up the RX pipeline + */ + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) + { + clib_unix_warning ("write"); + exit (0); + } + + for (iter = 0; iter < 100000; iter++) + { + if (iter < 99999) + { + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) + { + clib_unix_warning ("write"); + exit (0); + } + } + offset = 0; + + do + { + n = recv (sockfd, rx_buffer + offset, + vec_len (rx_buffer) - offset, 0 /* flags */ ); + if (n < 0) + { + clib_unix_warning ("read"); + exit (0); + } + offset += n; + } + while (offset < vec_len (rx_buffer)); + + for (i = 0; i < vec_len (rx_buffer); i++) + { + if (rx_buffer[i] != tx_buffer[i]) + { + clib_warning ("[%d] read 0x%x not 0x%x", + rx_buffer[i], tx_buffer[i]); + exit (1); + } + } + + } + close (sockfd); + return 0; +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 406a5f4e..e2834817 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -116,6 +116,7 @@ typedef struct pthread_t client_rx_thread_handle; u32 client_bytes_received; u8 test_return_packets; + u32 bytes_to_send; /* convenience */ svm_fifo_segment_main_t *segment_main; @@ -313,11 +314,16 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, rx_fifo = e->fifo; - bytes = e->enqueue_length; + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of new event */ + svm_fifo_unset_event (rx_fifo); + + /* Read the bytes */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), - utm->rx_buf); + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, + clib_min (vec_len (utm->rx_buf), + bytes), utm->rx_buf); if (n_read > 0) { bytes -= n_read; @@ -333,9 +339,17 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, } utm->client_bytes_received += n_read; } + else + { + if (n_read == -2) + { + clib_warning ("weird!"); + break; + } + } } - while (n_read < 0 || bytes > 0); + while (bytes > 0); } void @@ -479,47 +493,41 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) } } -void -client_send_data (uri_tcp_test_main_t * utm) +static void +send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, + u32 bytes) { u8 *test_data = utm->connect_test_data; u64 bytes_sent = 0; - int rv; - int mypid = getpid (); - session_t *session; - svm_fifo_t *tx_fifo; - int buffer_offset, bytes_to_send = 0; + int test_buf_offset = 0; + u32 bytes_to_snd; + u32 queue_max_chunk = 64 << 10, actual_write; session_fifo_event_t evt; static int serial_number = 0; - int i; - u32 max_chunk = 64 << 10, write; - - session = pool_elt_at_index (utm->sessions, utm->connected_session_index); - tx_fifo = session->server_tx_fifo; + int rv; - vec_validate (utm->rx_buf, vec_len (test_data) - 1); + bytes_to_snd = (bytes == 0) ? vec_len (test_data) : bytes; + if (bytes_to_snd > vec_len (test_data)) + bytes_to_snd = vec_len (test_data); - for (i = 0; i < 1; i++) + while (bytes_to_snd > 0) { - bytes_to_send = vec_len (test_data); - buffer_offset = 0; - while (bytes_to_send > 0) + actual_write = + bytes_to_snd > queue_max_chunk ? queue_max_chunk : bytes_to_snd; + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, actual_write, + test_data + test_buf_offset); + + if (rv > 0) { - write = bytes_to_send > max_chunk ? max_chunk : bytes_to_send; - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, - test_data + buffer_offset); + bytes_to_snd -= rv; + test_buf_offset += rv; + bytes_sent += rv; - if (rv > 0) + if (svm_fifo_set_event (tx_fifo)) { - bytes_to_send -= rv; - buffer_offset += rv; - bytes_sent += rv; - /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = rv; evt.event_id = serial_number++; unix_shared_memory_queue_add (utm->vpp_event_queue, @@ -528,13 +536,40 @@ client_send_data (uri_tcp_test_main_t * utm) } } } +} + +void +client_send_data (uri_tcp_test_main_t * utm) +{ + u8 *test_data = utm->connect_test_data; + int mypid = getpid (); + session_t *session; + svm_fifo_t *tx_fifo; + u32 n_iterations, leftover; + int i; + + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + tx_fifo = session->server_tx_fifo; + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + n_iterations = utm->bytes_to_send / vec_len (test_data); + + for (i = 0; i < n_iterations; i++) + { + send_test_chunk (utm, tx_fifo, mypid, 0); + } + + leftover = utm->bytes_to_send % vec_len (test_data); + if (leftover) + send_test_chunk (utm, tx_fifo, mypid, leftover); if (utm->test_return_packets) { f64 timeout = clib_time_now (&utm->clib_time) + 2; /* Wait for the outstanding packets */ - while (utm->client_bytes_received < vec_len (test_data)) + while (utm->client_bytes_received < + vec_len (test_data) * n_iterations + leftover) { if (clib_time_now (&utm->clib_time) > timeout) { @@ -542,9 +577,8 @@ client_send_data (uri_tcp_test_main_t * utm) break; } } - - utm->time_to_stop = 1; } + utm->time_to_stop = 1; } void @@ -599,6 +633,11 @@ client_test (uri_tcp_test_main_t * utm) /* Disconnect */ client_disconnect (utm); + + if (wait_for_state_change (utm, STATE_START)) + { + return; + } } static void @@ -714,7 +753,6 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { svm_fifo_t *rx_fifo, *tx_fifo; int n_read; - session_fifo_event_t evt; unix_shared_memory_queue_t *q; int rv, bytes; @@ -722,34 +760,46 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, rx_fifo = e->fifo; tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; - bytes = e->enqueue_length; + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (rx_fifo); + + if (bytes == 0) + return; + + /* Read the bytes */ do { n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), utm->rx_buf); + if (n_read > 0) + bytes -= n_read; + + if (utm->drop_packets) + continue; /* Reflect if a non-drop session */ - if (!utm->drop_packets && n_read > 0) + if (n_read > 0) { do { rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); } - while (rv == -2 && !utm->time_to_stop); - - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = n_read; - evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); - } + while (rv <= 0 && !utm->time_to_stop); - if (n_read > 0) - bytes -= n_read; + /* If event wasn't set, add one */ + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = e->event_id; + + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + } } while ((n_read < 0 || bytes > 0) && !utm->time_to_stop); } @@ -852,7 +902,10 @@ static void vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * mp) { + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + clib_warning ("retval %d", ntohl (mp->retval)); + utm->state = STATE_START; } #define foreach_uri_msg \ @@ -888,6 +941,7 @@ main (int argc, char **argv) u8 *heap, *uri = 0; u8 *bind_uri = (u8 *) "tcp://0.0.0.0/1234"; u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u32 bytes_to_send = 64 << 10, mbytes; u32 tmp; mheap_t *h; session_t *session; @@ -934,6 +988,10 @@ main (int argc, char **argv) drop_packets = 1; else if (unformat (a, "test")) test_return_packets = 1; + else if (unformat (a, "mbytes %d", &mbytes)) + { + bytes_to_send = mbytes << 20; + } else { fformat (stderr, "%s: usage [master|slave]\n"); @@ -956,6 +1014,7 @@ main (int argc, char **argv) utm->segment_main = &svm_fifo_segment_main; utm->drop_packets = drop_packets; utm->test_return_packets = test_return_packets; + utm->bytes_to_send = bytes_to_send; setup_signal_handlers (); uri_api_hookup (utm); diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 54625d64..e6c239c1 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -742,17 +742,20 @@ server_handle_fifo_event_rx (uri_udp_test_main_t * utm, /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = nbytes; evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + + if (svm_fifo_set_event (tx_fifo)) + { + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } } void server_handle_event_queue (uri_udp_test_main_t * utm) { - session_fifo_event_t _e, *e = &_e;; + session_fifo_event_t _e, *e = &_e; while (1) { diff --git a/src/vnet.am b/src/vnet.am index 3e73de8f..9c55e336 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -462,7 +462,9 @@ libvnet_la_SOURCES += \ vnet/tcp/tcp_output.c \ vnet/tcp/tcp_input.c \ vnet/tcp/tcp_newreno.c \ + vnet/tcp/builtin_client.c \ vnet/tcp/builtin_server.c \ + vnet/tcp/tcp_test.c \ vnet/tcp/tcp.c nobase_include_HEADERS += \ diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index a60a8b8b..480828f7 100644 --- a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -45,8 +45,7 @@ typedef struct _stream_session_cb_vft void (*session_reset_callback) (stream_session_t * s); /* Direct RX callback, for built-in servers */ - int (*builtin_server_rx_callback) (stream_session_t * session, - session_fifo_event_t * ep); + int (*builtin_server_rx_callback) (stream_session_t * session); /* Redirect connection to local server */ int (*redirect_connect_callback) (u32 api_client_index, void *mp); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 822afebd..8681105c 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -13,21 +13,14 @@ * limitations under the License. */ +#include #include #include -#include -#include - #include - -#include -#include #include -#include - -#include -#include +#include #include +#include vlib_node_registration_t session_queue_node; @@ -52,8 +45,8 @@ format_session_queue_trace (u8 * s, va_list * args) vlib_node_registration_t session_queue_node; -#define foreach_session_queue_error \ -_(TX, "Packets transmitted") \ +#define foreach_session_queue_error \ +_(TX, "Packets transmitted") \ _(TIMER, "Timer events") typedef enum @@ -91,10 +84,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, transport_proto_vft_t *transport_vft; u32 next_index, next0, *to_next, n_left_to_next, bi0; vlib_buffer_t *b0; - u32 rx_offset; + u32 rx_offset = 0, max_dequeue0; u16 snd_mss0; u8 *data0; - int i; + int i, n_bytes_read; next_index = next0 = session_type_to_next[s0->session_type]; @@ -106,24 +99,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, snd_mss0 = transport_vft->send_mss (tc0); /* Can't make any progress */ - if (snd_space0 == 0 || svm_fifo_max_dequeue (s0->server_tx_fifo) == 0 - || snd_mss0 == 0) + if (snd_space0 == 0 || snd_mss0 == 0) { vec_add1 (smm->evts_partially_read[thread_index], *e0); return 0; } - ASSERT (e0->enqueue_length > 0); - - /* Ensure we're not writing more than transport window allows */ - max_len_to_snd0 = clib_min (e0->enqueue_length, snd_space0); - if (peek_data) { /* Offset in rx fifo from where to peek data */ rx_offset = transport_vft->tx_fifo_offset (tc0); } + /* Check how much we can pull. If buffering, subtract the offset */ + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; + + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); + + /* Nothing to read return */ + if (max_dequeue0 == 0) + { + return 0; + } + + /* Ensure we're not writing more than transport window allows */ + max_len_to_snd0 = clib_min (max_dequeue0, snd_space0); + /* TODO check if transport is willing to send len_to_snd0 * bytes (Nagle) */ @@ -147,13 +149,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * XXX 0.9 because when debugging we might not get a full frame */ if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) { - /* Keep track of how much we've dequeued and exit */ - if (left_to_snd0 != max_len_to_snd0) + if (svm_fifo_set_event (s0->server_tx_fifo)) { - e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; vec_add1 (smm->evts_partially_read[thread_index], *e0); } - return -1; } @@ -198,9 +197,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, len_to_deq0 = (left_to_snd0 < snd_mss0) ? left_to_snd0 : snd_mss0; /* *INDENT-OFF* */ - SESSION_EVT_DBG(s0, SESSION_EVT_DEQ, ({ + SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ ed->data[0] = e0->event_id; - ed->data[1] = e0->enqueue_length; + ed->data[1] = max_dequeue0; ed->data[2] = len_to_deq0; ed->data[3] = left_to_snd0; })); @@ -214,29 +213,30 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * 2) buffer chains */ if (peek_data) { - int n_bytes_read; n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, s0->pid, rx_offset, len_to_deq0, data0); - if (n_bytes_read < 0) + if (n_bytes_read <= 0) goto dequeue_fail; /* Keep track of progress locally, transport is also supposed to - * increment it independently when pushing header */ + * increment it independently when pushing the header */ rx_offset += n_bytes_read; } else { - if (svm_fifo_dequeue_nowait (s0->server_tx_fifo, s0->pid, - len_to_deq0, data0) < 0) + n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, + s0->pid, len_to_deq0, + data0); + if (n_bytes_read <= 0) goto dequeue_fail; } - b0->current_length = len_to_deq0; + b0->current_length = n_bytes_read; /* Ask transport to push header */ transport_vft->push_header (tc0, b0); - left_to_snd0 -= len_to_deq0; + left_to_snd0 -= n_bytes_read; *n_tx_packets = *n_tx_packets + 1; vlib_validate_buffer_enqueue_x1 (vm, node, next_index, @@ -246,25 +246,31 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - /* If we couldn't dequeue all bytes store progress */ - if (max_len_to_snd0 < e0->enqueue_length) + /* If we couldn't dequeue all bytes mark as partially read */ + if (max_len_to_snd0 < max_dequeue0) { - e0->enqueue_length -= max_len_to_snd0; - vec_add1 (smm->evts_partially_read[thread_index], *e0); + /* If we don't already have new event */ + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } } return 0; dequeue_fail: - /* Can't read from fifo. Store event rx progress, save as partially read, - * return buff to free list and return */ - e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; - vec_add1 (smm->evts_partially_read[thread_index], *e0); + /* + * Can't read from fifo. If we don't already have an event, save as partially + * read, return buff to free list and return + */ + clib_warning ("dequeue fail"); - to_next -= 1; - n_left_to_next += 1; + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); _vec_len (smm->tx_buffers[thread_index]) += 1; - clib_warning ("dequeue fail"); return 0; } @@ -298,6 +304,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, session_fifo_event_t *my_fifo_events, *e; u32 n_to_dequeue, n_events; unix_shared_memory_queue_t *q; + application_t *app; int n_tx_packets = 0; u32 my_thread_index = vm->cpu_index; int i, rv; @@ -321,13 +328,18 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (n_to_dequeue == 0 && vec_len (my_fifo_events) == 0) return 0; + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 0); + /* * If we didn't manage to process previous events try going * over them again without dequeuing new ones. */ /* XXX: Block senders to sessions that can't keep up */ if (vec_len (my_fifo_events) >= 100) - goto skip_dequeue; + { + clib_warning ("too many fifo events unsolved"); + goto skip_dequeue; + } /* See you in the next life, don't be late */ if (pthread_mutex_trylock (&q->mutex)) @@ -352,19 +364,17 @@ skip_dequeue: { svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ stream_session_t *s0; - u32 server_session_index0, server_thread_index0; + u32 session_index0; session_fifo_event_t *e0; e0 = &my_fifo_events[i]; f0 = e0->fifo; - server_session_index0 = f0->server_session_index; - server_thread_index0 = f0->server_thread_index; + session_index0 = f0->server_session_index; /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (server_thread_index0 == my_thread_index); + ASSERT (f0->server_thread_index == my_thread_index); - s0 = stream_session_get_if_valid (server_session_index0, - my_thread_index); + s0 = stream_session_get_if_valid (session_index0, my_thread_index); if (CLIB_DEBUG && !s0) { @@ -385,11 +395,20 @@ skip_dequeue: rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, my_thread_index, &n_tx_packets); + /* Out of buffers */ if (rv < 0) goto done; break; - + case FIFO_EVENT_SERVER_EXIT: + stream_session_disconnect (s0); + break; + case FIFO_EVENT_BUILTIN_RX: + svm_fifo_unset_event (s0->server_rx_fifo); + /* Get session's server */ + app = application_get (s0->app_index); + app->cb_fns.builtin_server_rx_callback (s0); + break; default: clib_warning ("unhandled event type %d", e0->event_type); } @@ -418,6 +437,8 @@ done: vlib_node_increment_counter (vm, session_queue_node.index, SESSION_QUEUE_ERROR_TX, n_tx_packets); + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 1); + return n_tx_packets; } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 06e2a09a..f10918aa 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -804,30 +804,36 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) /* Get session's server */ app = application_get (s->app_index); - /* Fabricate event */ - evt.fifo = s->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; - evt.event_id = serial_number++; - evt.enqueue_length = svm_fifo_max_dequeue (s->server_rx_fifo); - /* Built-in server? Hand event to the callback... */ if (app->cb_fns.builtin_server_rx_callback) - return app->cb_fns.builtin_server_rx_callback (s, &evt); - - /* Add event to server's event queue */ - q = app->event_queue; + return app->cb_fns.builtin_server_rx_callback (s); - /* Based on request block (or not) for lack of space */ - if (block || PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - else - return -1; + /* If no event, send one */ + if (svm_fifo_set_event (s->server_rx_fifo)) + { + /* Fabricate event */ + evt.fifo = s->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + + /* Add event to server's event queue */ + q = app->event_queue; + + /* Based on request block (or not) for lack of space */ + if (block || PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + { + clib_warning ("fifo full"); + return -1; + } + } /* *INDENT-OFF* */ - SESSION_EVT_DBG(s, SESSION_EVT_ENQ, ({ + SESSION_EVT_DBG(SESSION_EVT_ENQ, s, ({ ed->data[0] = evt.event_id; - ed->data[1] = evt.enqueue_length; + ed->data[1] = svm_fifo_max_dequeue (s->server_rx_fifo); })); /* *INDENT-ON* */ @@ -1192,8 +1198,29 @@ stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, void stream_session_disconnect (stream_session_t * s) { +// session_fifo_event_t evt; + s->session_state = SESSION_STATE_CLOSED; + /* RPC to vpp evt queue in the right thread */ + tp_vfts[s->session_type].close (s->connection_index, s->thread_index); + +// { +// /* Fabricate event */ +// evt.fifo = s->server_rx_fifo; +// evt.event_type = FIFO_EVENT_SERVER_RX; +// evt.event_id = serial_number++; +// +// /* Based on request block (or not) for lack of space */ +// if (PREDICT_TRUE(q->cursize < q->maxsize)) +// unix_shared_memory_queue_add (app->event_queue, (u8 *) &evt, +// 0 /* do wait for mutex */); +// else +// { +// clib_warning("fifo full"); +// return -1; +// } +// } } /** diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 96c00d87..a39bc06f 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -33,6 +33,7 @@ typedef enum FIFO_EVENT_SERVER_TX, FIFO_EVENT_TIMEOUT, FIFO_EVENT_SERVER_EXIT, + FIFO_EVENT_BUILTIN_RX } fifo_event_type_t; #define foreach_session_input_error \ @@ -91,14 +92,13 @@ typedef enum SESSION_STATE_N_STATES, } stream_session_state_t; -typedef CLIB_PACKED (struct - { - svm_fifo_t * fifo; - u8 event_type; - /* $$$$ for event logging */ - u16 event_id; - u32 enqueue_length; - }) session_fifo_event_t; +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct { + svm_fifo_t * fifo; + u8 event_type; + u16 event_id; +}) session_fifo_event_t; +/* *INDENT-ON* */ typedef struct _stream_session_t { @@ -333,7 +333,7 @@ stream_session_get_index (stream_session_t * s) } always_inline u32 -stream_session_max_enqueue (transport_connection_t * tc) +stream_session_max_rx_enqueue (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); return svm_fifo_max_enqueue (s->server_rx_fifo); @@ -346,7 +346,6 @@ stream_session_fifo_size (transport_connection_t * tc) return s->server_rx_fifo->nitems; } - int stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, u8 queue_event); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index b029ee65..38762afc 100644 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -107,7 +107,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, { if (once_per_pool) { - str = format (str, "%-40s%-20s%-20s%-15s", + str = format (str, "%-50s%-20s%-20s%-15s", "Connection", "Rx fifo", "Tx fifo", "Session Index"); vlib_cli_output (vm, "%v", str); diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h index 858f12e0..80a97cd5 100644 --- a/src/vnet/session/session_debug.h +++ b/src/vnet/session/session_debug.h @@ -21,7 +21,8 @@ #define foreach_session_dbg_evt \ _(ENQ, "enqueue") \ - _(DEQ, "dequeue") + _(DEQ, "dequeue") \ + _(DEQ_NODE, "dequeue") typedef enum _session_evt_dbg { @@ -30,7 +31,10 @@ typedef enum _session_evt_dbg #undef _ } session_evt_dbg_e; -#if TRANSPORT_DEBUG +#define SESSION_DBG (0) +#define SESSION_DEQ_NODE_EVTS (0) + +#if TRANSPORT_DEBUG && SESSION_DBG #define DEC_SESSION_ETD(_s, _e, _size) \ struct \ @@ -44,6 +48,12 @@ typedef enum _session_evt_dbg ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ _e, _tc->elog_track) +#define DEC_SESSION_ED(_e, _size) \ + struct \ + { \ + u32 data[_size]; \ + } * ed; \ + ed = ELOG_DATA (&vlib_global_main.elog_main, _e) #define SESSION_EVT_DEQ_HANDLER(_s, _body) \ { \ @@ -67,13 +77,33 @@ typedef enum _session_evt_dbg do { _body; } while (0); \ } +#if SESSION_DEQ_NODE_EVTS +#define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "deq-node: %s", \ + .format_args = "t4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "start", \ + "end", \ + }, \ + }; \ + DEC_SESSION_ED(_e, 1); \ + ed->data[0] = _node_evt; \ +} +#else +#define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) -#define SESSION_EVT_DBG(_s, _evt, _body) CC(_evt, _HANDLER)(_s, _body) +#define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else -#define SESSION_EVT_DBG(_s, _evt, _body) +#define SESSION_EVT_DBG(_evt, _args...) #endif #endif /* SRC_VNET_SESSION_SESSION_DEBUG_H_ */ diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 421121d2..2f912cbc 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -38,7 +38,7 @@ typedef struct _transport_connection u32 thread_index; /**< Worker-thread index */ #if TRANSPORT_DEBUG - elog_track_t elog_track; /**< Debug purposes */ + elog_track_t elog_track; /**< Event logging */ #endif /** Macros for 'derived classes' where base is named "connection" */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c new file mode 100644 index 00000000..a6eeb775 --- /dev/null +++ b/src/vnet/tcp/builtin_client.c @@ -0,0 +1,411 @@ +/* + * builtin_client.c - vpp built-in tcp client/connect code + * + * Copyright (c) 2017 by Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +/* define message IDs */ +#include + +/* define message structures */ +#define vl_typedefs +#include +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include +#undef vl_printfun + +static void +send_test_chunk (tclient_main_t * tm, session_t * s) +{ + u8 *test_data = tm->connect_test_data; + int test_buf_offset = 0; + u32 bytes_this_chunk; + session_fifo_event_t evt; + static int serial_number = 0; + int rv; + + while (s->bytes_to_send > 0) + { + bytes_this_chunk = vec_len (test_data) < s->bytes_to_send + ? vec_len (test_data) : s->bytes_to_send; + + rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , + bytes_this_chunk, + test_data + test_buf_offset); + + if (rv > 0) + { + s->bytes_to_send -= rv; + test_buf_offset += rv; + + if (svm_fifo_set_event (s->server_tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = s->server_tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + } + } +} + +static void +receive_test_chunk (tclient_main_t * tm, session_t * s) +{ + svm_fifo_t *rx_fifo = s->server_rx_fifo; + int n_read, bytes, i; + + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of new event */ + svm_fifo_unset_event (rx_fifo); + + /* Read the bytes */ + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), + tm->rx_buf); + if (n_read > 0) + { + bytes -= n_read; + for (i = 0; i < n_read; i++) + { + if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff)) + { + clib_warning ("read %d error at byte %lld, 0x%x not 0x%x", + n_read, s->bytes_received + i, + tm->rx_buf[i], + ((s->bytes_received + i) & 0xff)); + } + } + s->bytes_to_receive -= n_read; + s->bytes_received += n_read; + } + + } + while (n_read < 0 || bytes > 0); +} + +static void * +tclient_thread_fn (void *arg) +{ + tclient_main_t *tm = &tclient_main; + vl_api_disconnect_session_t *dmp; + session_t *sp; + struct timespec ts, tsrem; + int i; + int try_tx, try_rx; + u32 *session_indices = 0; + + /* stats thread wants no signals. */ + { + sigset_t s; + sigfillset (&s); + pthread_sigmask (SIG_SETMASK, &s, 0); + } + + while (1) + { + /* Wait until we're told to get busy */ + while (tm->run_test == 0 + || (tm->ready_connections != tm->expected_connections)) + { + ts.tv_sec = 0; + ts.tv_nsec = 100000000; + while (nanosleep (&ts, &tsrem) < 0) + ts = tsrem; + } + tm->run_test = 0; + + clib_warning ("Run %d iterations", tm->n_iterations); + + for (i = 0; i < tm->n_iterations; i++) + { + session_t *sp; + + do + { + try_tx = try_rx = 0; + + /* *INDENT-OFF* */ + pool_foreach (sp, tm->sessions, ({ + if (sp->bytes_to_send > 0) + { + send_test_chunk (tm, sp); + try_tx = 1; + } + })); + pool_foreach (sp, tm->sessions, ({ + if (sp->bytes_to_receive > 0) + { + receive_test_chunk (tm, sp); + try_rx = 1; + } + })); + /* *INDENT-ON* */ + + } + while (try_tx || try_rx); + } + clib_warning ("Done %d iterations", tm->n_iterations); + + /* Disconnect sessions... */ + vec_reset_length (session_indices); + pool_foreach (sp, tm->sessions, ( + { + vec_add1 (session_indices, + sp - tm->sessions); + } + )); + + for (i = 0; i < vec_len (session_indices); i++) + { + sp = pool_elt_at_index (tm->sessions, session_indices[i]); + dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = tm->my_client_index; + dmp->session_index = sp->vpp_session_index; + dmp->session_thread_index = sp->vpp_session_thread; + vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + pool_put (tm->sessions, sp); + } + } + /* NOTREACHED */ + return 0; +} + +/* So we don't get "no handler for... " msgs */ +static void +vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) +{ + tclient_main_t *tm = &tclient_main; + + tm->my_client_index = mp->index; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + tclient_main_t *tm = &tclient_main; + session_t *session; + u32 session_index; + u64 key; + i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ; + + if (retval < 0) + { + clib_warning ("connection failed: retval %d", retval); + return; + } + + tm->our_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + tm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* + * Setup session + */ + pool_get (tm->sessions, session); + memset (session, 0, sizeof (*session)); + session_index = session - tm->sessions; + session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + + session->server_rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + session->server_rx_fifo->client_session_index = session_index; + session->server_tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + session->server_tx_fifo->client_session_index = session_index; + + session->vpp_session_index = mp->session_index; + session->vpp_session_thread = mp->session_thread_index; + + /* Add it to the session lookup table */ + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + hash_set (tm->session_index_by_vpp_handles, key, session_index); + + tm->ready_connections++; +} + +static void +create_api_loopback (tclient_main_t * tm) +{ + vl_api_memclnt_create_t _m, *mp = &_m; + extern void vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t *); + api_main_t *am = &api_main; + vl_shmem_hdr_t *shmem_hdr; + + /* + * Create a "loopback" API client connection + * Don't do things like this unless you know what you're doing... + */ + + shmem_hdr = am->shmem_hdr; + tm->vl_input_queue = shmem_hdr->vl_input_queue; + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; + mp->context = 0xFEEDFACE; + mp->input_queue = (u64) tm->vl_input_queue; + strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); + + vl_api_memclnt_create_t_handler (mp); +} + +#define foreach_tclient_static_api_msg \ +_(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \ +_(CONNECT_URI_REPLY, connect_uri_reply) + +static clib_error_t * +tclient_api_hookup (vlib_main_t * vm) +{ + tclient_main_t *tm = &tclient_main; + vl_msg_api_msg_config_t _c, *c = &_c; + int i; + + /* Init test data */ + vec_validate (tm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (tm->connect_test_data); i++) + tm->connect_test_data[i] = i & 0xff; + + tm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1); + + /* Hook up client-side static APIs to our handlers */ +#define _(N,n) do { \ + c->id = VL_API_##N; \ + c->name = #n; \ + c->handler = vl_api_##n##_t_handler; \ + c->cleanup = vl_noop_handler; \ + c->endian = vl_api_##n##_t_endian; \ + c->print = vl_api_##n##_t_print; \ + c->size = sizeof(vl_api_##n##_t); \ + c->traced = 1; /* trace, so these msgs print */ \ + c->replay = 0; /* don't replay client create/delete msgs */ \ + c->message_bounce = 0; /* don't bounce this message */ \ + vl_msg_api_config(c);} while (0); + + foreach_tclient_static_api_msg; +#undef _ + + return 0; +} + +VLIB_API_INIT_FUNCTION (tclient_api_hookup); + +static clib_error_t * +test_tcp_clients_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u8 *uri; + tclient_main_t *tm = &tclient_main; + int i; + u32 n_clients = 1; + + tm->bytes_to_send = 8192; + tm->n_iterations = 1; + vec_free (tm->connect_uri); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "nclients %d", &n_clients)) + ; + else if (unformat (input, "iterations %d", &tm->n_iterations)) + ; + else if (unformat (input, "bytes %d", &tm->bytes_to_send)) + ; + else if (unformat (input, "uri %s", &tm->connect_uri)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + tm->ready_connections = 0; + tm->expected_connections = n_clients; + uri = connect_uri; + if (tm->connect_uri) + uri = tm->connect_uri; + + create_api_loopback (tm); + + /* Start a transmit thread */ + if (tm->client_thread_handle == 0) + { + int rv = pthread_create (&tm->client_thread_handle, + NULL /*attr */ , tclient_thread_fn, 0); + if (rv) + { + tm->client_thread_handle = 0; + return clib_error_return (0, "pthread_create returned %d", rv); + } + } + + /* Fire off connect requests, in something approaching a normal manner */ + for (i = 0; i < n_clients; i++) + { + vl_api_connect_uri_t *cmp; + cmp = vl_msg_api_alloc_as_if_client (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = tm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, uri, strlen ((char *) uri) + 1); + vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & cmp); + } + + tm->run_test = 1; + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_clients_command, static) = +{ + .path = "test tcp clients", + .short_help = "test tcp clients", + .function = test_tcp_clients_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h new file mode 100644 index 00000000..64030302 --- /dev/null +++ b/src/vnet/tcp/builtin_client.h @@ -0,0 +1,131 @@ + +/* + * tclient.h - skeleton vpp engine plug-in header file + * + * Copyright (c) + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_tclient_h__ +#define __included_tclient_h__ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +typedef struct +{ + u32 bytes_to_send; + u32 bytes_sent; + u32 bytes_to_receive; + u32 bytes_received; + + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + u32 vpp_session_index; + u32 vpp_session_thread; +} session_t; + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* URI for slave's connect */ + u8 *connect_uri; + + u32 connected_session_index; + + int i_am_master; + + /* drop all packets */ + int drop_packets; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + pid_t my_pid; + + /* For deadman timers */ + clib_time_t clib_time; + + /* Connection counts */ + u32 expected_connections; + volatile u32 ready_connections; + + /* Signal variables */ + volatile int run_test; + + /* Number of iterations */ + int n_iterations; + + /* Bytes to send */ + u32 bytes_to_send; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; + + u8 *connect_test_data; + pthread_t client_thread_handle; + u32 client_bytes_received; + u8 test_return_packets; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; +} tclient_main_t; + +tclient_main_t tclient_main; + +vlib_node_registration_t tclient_node; + +#endif /* __included_tclient_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index dd6759c5..efd26e91 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -22,6 +22,7 @@ typedef struct { u8 *rx_buf; unix_shared_memory_queue_t **vpp_queue; + u32 byte_index; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -37,6 +38,7 @@ builtin_session_accept_callback (stream_session_t * s) bsm->vpp_queue[s->thread_index] = session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; + bsm->byte_index = 0; return 0; } @@ -80,57 +82,94 @@ builtin_redirect_connect_callback (u32 client_index, void *mp) return -1; } +void +test_bytes (builtin_server_main_t * bsm, int actual_transfer) +{ + int i; + + for (i = 0; i < actual_transfer; i++) + { + if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) + { + clib_warning ("at %d expected %d got %d", bsm->byte_index + i, + (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); + } + } + bsm->byte_index += actual_transfer; +} + int -builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * e) +builtin_server_rx_callback (stream_session_t * s) { - int n_written, bytes, total_copy_bytes; - int n_read; - svm_fifo_t *tx_fifo; + u32 n_written, max_dequeue, max_enqueue, max_transfer; + int actual_transfer; + svm_fifo_t *tx_fifo, *rx_fifo; builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - bytes = e->enqueue_length; - if (PREDICT_FALSE (bytes <= 0)) + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); + max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); + + if (PREDICT_FALSE (max_dequeue == 0)) { - clib_warning ("bizarre rx callback: bytes %d", bytes); return 0; } tx_fifo = s->server_tx_fifo; + rx_fifo = s->server_rx_fifo; /* Number of bytes we're going to copy */ - total_copy_bytes = (bytes < (tx_fifo->nitems - tx_fifo->cursize)) ? bytes : - tx_fifo->nitems - tx_fifo->cursize; + max_transfer = (max_dequeue < max_enqueue) ? max_dequeue : max_enqueue; - if (PREDICT_FALSE (total_copy_bytes <= 0)) + /* No space in tx fifo */ + if (PREDICT_FALSE (max_transfer == 0)) { - clib_warning ("no space in tx fifo, event had %d bytes", bytes); + /* XXX timeout for session that are stuck */ + + /* Program self-tap to retry */ + if (svm_fifo_set_event (rx_fifo)) + { + evt.fifo = rx_fifo; + evt.event_type = FIFO_EVENT_BUILTIN_RX; + evt.event_id = 0; + unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + return 0; } - vec_validate (bsm->rx_buf, total_copy_bytes - 1); - _vec_len (bsm->rx_buf) = total_copy_bytes; + svm_fifo_unset_event (rx_fifo); + + vec_validate (bsm->rx_buf, max_transfer - 1); + _vec_len (bsm->rx_buf) = max_transfer; - n_read = svm_fifo_dequeue_nowait (s->server_rx_fifo, 0, total_copy_bytes, - bsm->rx_buf); - ASSERT (n_read == total_copy_bytes); + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, max_transfer, + bsm->rx_buf); + ASSERT (actual_transfer == max_transfer); + +// test_bytes (bsm, actual_transfer); /* * Echo back */ - n_written = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, bsm->rx_buf); - ASSERT (n_written == total_copy_bytes); + n_written = + svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf); + ASSERT (n_written == max_transfer); - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - evt.enqueue_length = total_copy_bytes; - evt.event_id = serial_number++; + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], (u8 *) & evt, - 0 /* do wait for mutex */ ); + unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, 0 /* do wait for mutex */ ); + } return 0; } @@ -164,7 +203,7 @@ server_create (vlib_main_t * vm) a->api_client_index = ~0; a->session_cb_vft = &builtin_session_cb_vft; a->options = options; - a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 10; + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; a->segment_name = segment_name; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 0d2e6d0e..c3df5bc1 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -328,7 +328,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) { tcp_connection_timers_init (tc); tcp_set_snd_mss (tc); - tc->sack_sb.head = TCP_INVALID_SACK_HOLE_INDEX; + scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); } @@ -558,17 +558,48 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +/** + * Compute tx window session is allowed to fill. + */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { + u32 snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return tcp_available_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked bytes + * then we can still send */ + if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + && (tc->rcv_dupacks == 0 + || tc->sack_sb.last_sacked_bytes))) + { + snd_space = tcp_available_snd_space (tc); + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + return snd_space; + } + + /* If in fast recovery, send 1 SMSS if wnd allows */ + if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) + && tcp_fastrecovery_sent_1_smss (tc)) + { + tcp_fastrecovery_1_smss_on (tc); + return tc->snd_mss; + } + + return 0; } u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + + ASSERT (seq_geq (tc->snd_nxt, tc->snd_una)); + + /* This still works if fast retransmit is on */ return (tc->snd_nxt - tc->snd_una); } @@ -762,7 +793,7 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->timer_wheels, num_threads - 1); tcp_initialize_timer_wheels (tm); - vec_validate (tm->delack_connections, num_threads - 1); +// vec_validate (tm->delack_connections, num_threads - 1); /* Initialize clocks per tick for TCP timestamp. Used to compute * monotonically increasing timestamps. */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 082ab1d8..b4286bc4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -30,9 +30,10 @@ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ #define TCP_MAX_OPTION_SPACE 40 -#define TCP_DUPACK_THRESHOLD 3 -#define TCP_MAX_RX_FIFO_SIZE 2 << 20 -#define TCP_IW_N_SEGMENTS 10 +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_IW_N_SEGMENTS 10 +#define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -102,13 +103,12 @@ void tcp_update_time (f64 now, u32 thread_index); /** TCP connection flags */ #define foreach_tcp_connection_flag \ - _(DELACK, "Delay ACK") \ _(SNDACK, "Send ACK") \ - _(BURSTACK, "Burst ACK set") \ _(FINSNT, "FIN sent") \ _(SENT_RCV_WND0, "Sent 0 receive window") \ _(RECOVERY, "Recovery on") \ - _(FAST_RECOVERY, "Fast Recovery on") + _(FAST_RECOVERY, "Fast Recovery on") \ + _(FR_1_SMSS, "Sent 1 SMSS") typedef enum _tcp_connection_flag_bits { @@ -160,8 +160,12 @@ typedef struct _sack_scoreboard_hole typedef struct _sack_scoreboard { sack_scoreboard_hole_t *holes; /**< Pool of holes */ - u32 head; /**< Index to first entry */ + u32 head; /**< Index of first entry */ + u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ + u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 snd_una_adv; /**< Bytes to add to snd_una */ + u32 max_byte_sacked; /**< Highest byte acked */ } sack_scoreboard_t; typedef enum _tcp_cc_algorithm_type @@ -214,7 +218,7 @@ typedef struct _tcp_connection sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */ sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ - u8 rcv_dupacks; /**< Number of DUPACKs received */ + u16 rcv_dupacks; /**< Number of DUPACKs received */ u8 snt_dupacks; /**< Number of DUPACKs sent in a burst */ /* Congestion control */ @@ -224,6 +228,7 @@ typedef struct _tcp_connection u32 bytes_acked; /**< Bytes acknowledged by current segment */ u32 rtx_bytes; /**< Retransmitted bytes */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ + u32 snd_congestion; /**< snd_una_max when congestion is detected */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ /* RTT and RTO */ @@ -250,8 +255,10 @@ struct _tcp_cc_algorithm #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) #define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) -#define tcp_recovery_off(tc) ((tc)->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) +#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) typedef enum { @@ -293,8 +300,8 @@ typedef struct _tcp_main /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; - /* Convenience per worker-thread vector of connections to DELACK */ - u32 **delack_connections; +// /* Convenience per worker-thread vector of connections to DELACK */ +// u32 **delack_connections; /* Pool of half-open connections on which we've sent a SYN */ tcp_connection_t *half_open_connections; @@ -397,8 +404,16 @@ tcp_end_seq (tcp_header_t * th, u32 len) always_inline u32 tcp_flight_size (const tcp_connection_t * tc) { - return tc->snd_una_max - tc->snd_una - tc->sack_sb.sacked_bytes - + tc->rtx_bytes; + int flight_size; + + flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes) + - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ; + + /* Happens if we don't clear sacked bytes */ + if (flight_size < 0) + return 0; + + return flight_size; } /** @@ -439,9 +454,13 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +void tcp_update_rcv_wnd (tcp_connection_t * tc); + void tcp_retransmit_first_unacked (tcp_connection_t * tc); void tcp_fast_retransmit (tcp_connection_t * tc); +void tcp_cc_congestion (tcp_connection_t * tc); +void tcp_cc_recover (tcp_connection_t * tc); always_inline u32 tcp_time_now (void) @@ -453,7 +472,7 @@ u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 max_bytes); + u32 offset, u32 max_bytes); void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); @@ -476,14 +495,6 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) tc->c_c_index, timer_id, interval); } -always_inline void -tcp_retransmit_timer_set (tcp_connection_t * tc) -{ - /* XXX Switch to faster TW */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, - clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); -} - always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { @@ -506,6 +517,27 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) tc->c_c_index, timer_id, interval); } +/* XXX Switch retransmit to faster TW */ +always_inline void +tcp_retransmit_timer_set (tcp_connection_t * tc) +{ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_retransmit_timer_reset (tcp_connection_t * tc) +{ + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { @@ -516,6 +548,14 @@ void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); +always_inline sack_scoreboard_hole_t * +scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) +{ + if (index != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, index); + return 0; +} + always_inline sack_scoreboard_hole_t * scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) { @@ -532,6 +572,14 @@ scoreboard_first_hole (sack_scoreboard_t * sb) return 0; } +always_inline sack_scoreboard_hole_t * +scoreboard_last_hole (sack_scoreboard_t * sb) +{ + if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->tail); + return 0; +} + always_inline void scoreboard_clear (sack_scoreboard_t * sb) { @@ -540,6 +588,10 @@ scoreboard_clear (sack_scoreboard_t * sb) { scoreboard_remove_hole (sb, hole); } + sb->sacked_bytes = 0; + sb->last_sacked_bytes = 0; + sb->snd_una_adv = 0; + sb->max_byte_sacked = 0; } always_inline u32 @@ -548,6 +600,21 @@ scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) return hole->end - hole->start; } +always_inline u32 +scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + return hole - sb->holes; +} + +always_inline void +scoreboard_init (sack_scoreboard_t * sb) +{ + sb->head = TCP_INVALID_SACK_HOLE_INDEX; + sb->tail = TCP_INVALID_SACK_HOLE_INDEX; +} + +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); + always_inline void tcp_cc_algo_register (tcp_cc_algorithm_type_e type, const tcp_cc_algorithm_t * vft) diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 069c512d..5a71694e 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,6 +19,8 @@ #include #define TCP_DEBUG (1) +#define TCP_DEBUG_CC (1) +#define TCP_DEBUG_VERBOSE (0) #define foreach_tcp_dbg_evt \ _(INIT, "") \ @@ -30,14 +32,24 @@ _(DELETE, "delete") \ _(SYN_SENT, "SYN sent") \ _(FIN_SENT, "FIN sent") \ + _(ACK_SENT, "ACK sent") \ + _(DUPACK_SENT, "DUPACK sent") \ _(RST_SENT, "RST sent") \ _(SYN_RCVD, "SYN rcvd") \ _(ACK_RCVD, "ACK rcvd") \ + _(DUPACK_RCVD, "DUPACK rcvd") \ _(FIN_RCVD, "FIN rcvd") \ _(RST_RCVD, "RST rcvd") \ _(PKTIZE, "packetize") \ _(INPUT, "in") \ - _(TIMER_POP, "timer pop") + _(SND_WND, "snd_wnd update") \ + _(OUTPUT, "output") \ + _(TIMER_POP, "timer pop") \ + _(CC_RTX, "retransmit") \ + _(CC_EVT, "cc event") \ + _(CC_PACK, "cc partial ack") \ + _(SEG_INVALID, "invalid segment") \ + _(ACK_RCV_ERR, "invalid ack") \ typedef enum _tcp_dbg { @@ -73,10 +85,10 @@ typedef enum _tcp_dbg_evt ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ _e, _tc->c_elog_track) -#define TCP_EVT_INIT_HANDLER(_tc, ...) \ +#define TCP_EVT_INIT_HANDLER(_tc, _fmt, ...) \ { \ _tc->c_elog_track.name = \ - (char *) format (0, "%d%c", _tc->c_c_index, 0); \ + (char *) format (0, _fmt, _tc->c_c_index, 0); \ elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\ } @@ -87,7 +99,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_OPEN_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "open: index %d", \ @@ -110,7 +122,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_BIND_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "l%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "bind: listener %d", \ @@ -138,16 +150,44 @@ typedef enum _tcp_dbg_evt .format = "delete: %d", \ .format_args = "i4", \ }; \ - DECLARE_ETD(_tc, _e, 0); \ + DECLARE_ETD(_tc, _e, 1); \ ed->data[0] = _tc->c_c_index; \ TCP_EVT_DEALLOC_HANDLER(_tc); \ } +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->rcv_wnd; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ +} + +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->rcv_nxt - _tc->irs; \ + ed->data[1] = _tc->rcv_wnd; \ + ed->data[2] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = tcp_available_wnd(_tc); \ +} + #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYN: iss %d", \ + .format = "SYNtx: iss %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -158,7 +198,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FIN: snd_nxt %d rcv_nxt %d", \ + .format = "FINtx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -170,7 +210,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RST: snd_nxt %d rcv_nxt %d", \ + .format = "RSTtx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -180,10 +220,10 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYN rcvd: irs %d", \ + .format = "SYNrx: irs %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -194,7 +234,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FIN rcvd: snd_nxt %d rcv_nxt %d", \ + .format = "FINrx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -206,7 +246,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RST rcvd: snd_nxt %d rcv_nxt %d", \ + .format = "RSTrx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -214,54 +254,68 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \ +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "ACK: acked %u cwnd %u inflight %u", \ - .format_args = "i4i4i4", \ + .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u", \ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 3); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->bytes_acked; \ - ed->data[1] = _tc->cwnd; \ - ed->data[2] = tcp_flight_size(_tc); \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _ack - _tc->iss; \ + ed->data[3] = _tc->cwnd; \ + ed->data[4] = tcp_flight_size(_tc); \ } -#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "pktize: snd_una %u snd_nxt %u una_max %u", \ - .format_args = "i4i4i4", \ + .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u", \ + .format_args = "i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 3); \ + DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->snd_una - _tc->iss; \ - ed->data[1] = _tc->snd_nxt - _tc->iss; \ - ed->data[2] = _tc->snd_una_max - _tc->iss; \ + ed->data[1] = _tc->cwnd; \ + ed->data[2] = _tc->snd_wnd; \ + ed->data[3] = tcp_flight_size(_tc); \ } -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "out: flags %x, bytes %u", \ - .format_args = "i4i4", \ + .format = "pktize: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = flags; \ - ed->data[1] = n_bytes; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_nxt - _tc->iss; \ + ed->data[2] = tcp_available_snd_space (_tc); \ + ed->data[3] = tcp_flight_size (_tc); \ + ed->data[4] = _tc->rcv_wnd; \ } -#define TCP_EVT_INPUT_HANDLER(_tc, n_bytes, ...) \ +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "in: bytes %u rcv_nxt %u", \ - .format_args = "i4i4", \ + .format = "in: %s len %u written %d rcv_nxt %u free wnd %d", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "order", \ + "ooo", \ + }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = n_bytes; \ - ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _len; \ + ed->data[2] = _written; \ + ed->data[3] = (_tc->rcv_nxt - _tc->irs) + _written; \ + ed->data[4] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \ } #define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) \ @@ -296,9 +350,131 @@ typedef enum _tcp_dbg_evt ed->data[1] = _timer_id; \ } +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->rcv_las - _tc->irs; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 3, \ + .enum_strings = { \ + "invalid", \ + "old", \ + "future", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_una_max - _tc->iss; \ +} + +/* + * Congestion Control + */ + +#if TCP_DEBUG_CC +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rtx: snd_nxt %u offset %u snd %u rtx %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = offset; \ + ed->data[2] = n_bytes; \ + ed->data[3] = _tc->rtx_bytes; \ +} + +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u", \ + .format_args = "t4i4i4i4", \ + .n_enum_strings = 5, \ + .enum_strings = { \ + "fast-rtx", \ + "rtx-timeout", \ + "first-rtx", \ + "recovered", \ + "congestion", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _sub_evt; \ + ed->data[1] = tcp_available_snd_space (_tc); \ + ed->data[2] = _tc->snd_congestion - _tc->iss; \ + ed->data[3] = _tc->rtx_bytes; \ +} + +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "pack: snd_una %u snd_una_max %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_una_max - _tc->iss; \ +} + +#else +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, _snd_space, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) +#endif + +#if TCP_DBG_VERBOSE +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "snd_wnd update: %u ", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->snd_wnd; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} +#else +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) - #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index 2dbdd9b3..b91a08c0 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -12,12 +12,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - tcp_error (NONE, "no error") tcp_error (NO_LISTENER, "no listener for dst port") tcp_error (LOOKUP_DROPS, "lookup drops") tcp_error (DISPATCH, "Dispatch error") tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (PARTIALLY_ENQUEUED, "Packets partially pushed into rx fifo") tcp_error (PURE_ACK, "Pure acks") tcp_error (SYNS_RCVD, "SYNs received") tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") @@ -26,11 +26,14 @@ tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") -tcp_error (SEGMENT_INVALID, "Invalid segment") +tcp_error (SEGMENT_INVALID, "Invalid segments") +tcp_error (SEGMENT_OLD, "Old segment") tcp_error (ACK_INVALID, "Invalid ACK") tcp_error (ACK_DUP, "Duplicate ACK") tcp_error (ACK_OLD, "Old ACK") +tcp_error (ACK_FUTURE, "Future ACK") tcp_error (PKTS_SENT, "Packets sent") tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") +tcp_error (NO_WND, "No window") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 67af4321..5d11985f 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -95,13 +95,21 @@ vlib_node_registration_t tcp6_established_node; * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the * peer's reference when computing our receive window. * - * This accepts only segments within the window. + * This: + * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las) + * however, is too strict when we have retransmits. Instead we just check that + * the seq is not beyond the right edge and that the end of the segment is not + * less than the left edge. + * + * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so + * use rcv_nxt in the right edge window test instead of rcv_las. + * */ always_inline u8 tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) { - return seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) - && seq_geq (seq, tc->rcv_nxt); + return (seq_geq (end_seq, tc->rcv_las) + && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd)); } void @@ -253,6 +261,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, { tcp_make_ack (tc0, b0); *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); return -1; } } @@ -262,13 +271,25 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end)) { - if (!tcp_rst (th0)) + /* If our window is 0 and the packet is in sequence, let it pass + * through for ack processing. It should be dropped later.*/ + if (tc0->rcv_wnd == 0 + && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) { - /* Send dup ack */ - tcp_make_ack (tc0, b0); - *next0 = tcp_next_output (tc0->c_is_ip4); + /* Make it look as if there's nothing to dequeue */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number; + } + else + { + /* If not RST, send dup ack */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); + } + return -1; } - return -1; } /* 2nd: check the RST bit */ @@ -326,13 +347,13 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ - tc->rttvar += (clib_abs (err) - tc->rttvar) >> 2; + tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; } else { /* First measurement. */ tc->srtt = mrtt; - tc->rttvar = mrtt << 1; + tc->rttvar = mrtt >> 1; } } @@ -394,7 +415,11 @@ tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) } } -/** Check if dupack as per RFC5681 Sec. 2 */ +/** + * Check if dupack as per RFC5681 Sec. 2 + * + * This works only if called before updating snd_wnd. + * */ always_inline u8 tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) { @@ -429,10 +454,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) } sack_scoreboard_hole_t * -scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, +scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, u32 start, u32 end) { - sack_scoreboard_hole_t *hole, *next; + sack_scoreboard_hole_t *hole, *next, *prev; u32 hole_index; pool_get (sb->holes, hole); @@ -442,6 +467,7 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, hole->end = end; hole_index = hole - sb->holes; + prev = scoreboard_get_hole (sb, prev_index); if (prev) { hole->prev = prev - sb->holes; @@ -462,28 +488,35 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, return hole; } -static void +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; - sack_scoreboard_hole_t *hole, *next_hole; - u32 blk_index = 0; + sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; + u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; - if (!tcp_opts_sack (tc) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + sb->last_sacked_bytes = 0; + sb->snd_una_adv = 0; + old_sacked_bytes = sb->sacked_bytes; + + if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; /* Remove invalid blocks */ - vec_foreach (blk, tc->opt.sacks) - { - if (seq_lt (blk->start, blk->end) - && seq_gt (blk->start, tc->snd_una) - && seq_gt (blk->start, ack) && seq_lt (blk->end, tc->snd_nxt)) - continue; - - vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); - } + blk = tc->opt.sacks; + while (blk < vec_end (tc->opt.sacks)) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt)) + { + blk++; + continue; + } + vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + } /* Add block for cumulative ack */ if (seq_gt (ack, tc->snd_una)) @@ -498,7 +531,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Make sure blocks are ordered */ for (i = 0; i < vec_len (tc->opt.sacks); i++) - for (j = i; j < vec_len (tc->opt.sacks); j++) + for (j = i + 1; j < vec_len (tc->opt.sacks); j++) if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) { tmp = tc->opt.sacks[i]; @@ -506,10 +539,22 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) tc->opt.sacks[j] = tmp; } - /* If no holes, insert the first that covers all outstanding bytes */ if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) { - scoreboard_insert_hole (sb, 0, tc->snd_una, tc->snd_una_max); + /* If no holes, insert the first that covers all outstanding bytes */ + last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + tc->snd_una, tc->snd_una_max); + sb->tail = scoreboard_hole_index (sb, last_hole); + } + else + { + /* If we have holes but snd_una_max is beyond the last hole, update + * last hole end */ + tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + last_hole = scoreboard_last_hole (sb); + if (seq_gt (tc->snd_una_max, sb->max_byte_sacked) + && seq_gt (tc->snd_una_max, last_hole->end)) + last_hole->end = tc->snd_una_max; } /* Walk the holes with the SACK blocks */ @@ -526,10 +571,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) next_hole = scoreboard_next_hole (sb, hole); /* Byte accounting */ - if (seq_lt (hole->end, ack)) + if (seq_leq (hole->end, ack)) { - /* Bytes lost because snd wnd left edge advances */ - if (seq_lt (next_hole->start, ack)) + /* Bytes lost because snd_wnd left edge advances */ + if (next_hole && seq_leq (next_hole->start, ack)) sb->sacked_bytes -= next_hole->start - hole->end; else sb->sacked_bytes -= ack - hole->end; @@ -539,35 +584,78 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->sacked_bytes += scoreboard_hole_bytes (hole); } + /* snd_una needs to be advanced */ + if (seq_geq (ack, hole->end)) + { + if (next_hole && seq_lt (ack, next_hole->start)) + sb->snd_una_adv = next_hole->start - ack; + else + sb->snd_una_adv = sb->max_byte_sacked - ack; + + /* all these can be delivered */ + sb->sacked_bytes -= sb->snd_una_adv; + } + + /* About to remove last hole */ + if (hole == last_hole) + { + sb->tail = hole->prev; + last_hole = scoreboard_last_hole (sb); + /* keep track of max byte sacked in case the last hole + * is acked */ + if (seq_gt (hole->end, sb->max_byte_sacked)) + sb->max_byte_sacked = hole->end; + } scoreboard_remove_hole (sb, hole); hole = next_hole; } - /* Partial overlap */ + /* Partial 'head' overlap */ else { - sb->sacked_bytes += blk->end - hole->start; - hole->start = blk->end; + if (seq_gt (blk->end, hole->start)) + { + sb->sacked_bytes += blk->end - hole->start; + hole->start = blk->end; + } blk_index++; } } else { /* Hole must be split */ - if (seq_leq (blk->end, hole->end)) + if (seq_lt (blk->end, hole->end)) { sb->sacked_bytes += blk->end - blk->start; - scoreboard_insert_hole (sb, hole, blk->end, hole->end); - hole->end = blk->start - 1; + hole_index = scoreboard_hole_index (sb, hole); + new_hole = scoreboard_insert_hole (sb, hole_index, blk->end, + hole->end); + + /* Pool might've moved */ + hole = scoreboard_get_hole (sb, hole_index); + hole->end = blk->start; + + /* New or split of tail */ + if ((last_hole->end == new_hole->end) + || seq_lt (last_hole->end, new_hole->start)) + { + last_hole = new_hole; + sb->tail = scoreboard_hole_index (sb, new_hole); + } + blk_index++; + hole = scoreboard_next_hole (sb, hole); } else { - sb->sacked_bytes += hole->end - blk->start + 1; - hole->end = blk->start - 1; + sb->sacked_bytes += hole->end - blk->start; + hole->end = blk->start; hole = scoreboard_next_hole (sb, hole); } } } + + sb->last_sacked_bytes = sb->sacked_bytes + sb->snd_una_adv + - old_sacked_bytes; } /** Update snd_wnd @@ -577,72 +665,94 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) static void tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) { - if (tc->snd_wl1 < seq || (tc->snd_wl1 == seq && tc->snd_wl2 <= ack)) + if (seq_lt (tc->snd_wl1, seq) + || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) { tc->snd_wnd = snd_wnd; tc->snd_wl1 = seq; tc->snd_wl2 = ack; + TCP_EVT_DBG (TCP_EVT_SND_WND, tc); } } -static void +void tcp_cc_congestion (tcp_connection_t * tc) { + tc->snd_congestion = tc->snd_nxt; tc->cc_algo->congestion (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } -static void +void tcp_cc_recover (tcp_connection_t * tc) { - if (tcp_in_fastrecovery (tc)) - { - tc->cc_algo->recovered (tc); - tcp_recovery_off (tc); - } - else if (tcp_in_recovery (tc)) - { - tcp_recovery_off (tc); - tc->cwnd = tcp_loss_wnd (tc); - } + tc->cc_algo->recovered (tc); + + tc->rtx_bytes = 0; + tc->rcv_dupacks = 0; + tc->snd_nxt = tc->snd_una; + + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->opt.tsecr; + + tcp_fastrecovery_1_smss_off (tc); + tcp_fastrecovery_off (tc); + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } static void -tcp_cc_rcv_ack (tcp_connection_t * tc) +tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; if (tcp_in_recovery (tc)) { - partial_ack = seq_lt (tc->snd_una, tc->snd_una_max); + partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) { /* Clear retransmitted bytes. */ - tc->rtx_bytes = 0; tcp_cc_recover (tc); } else { + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + /* Clear retransmitted bytes. XXX should we clear all? */ tc->rtx_bytes = 0; tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); - /* Retransmit first unacked segment */ - tcp_retransmit_first_unacked (tc); + /* In case snd_nxt is still in the past and output tries to + * shove some new bytes */ + tc->snd_nxt = tc->snd_una; + + /* XXX need proper RFC6675 support */ + if (tc->sack_sb.last_sacked_bytes) + { + tcp_fast_retransmit (tc); + } + else + { + /* Retransmit first unacked segment */ + tcp_retransmit_first_unacked (tc); + /* If window allows, send 1 SMSS of new data */ + if (seq_lt (tc->snd_nxt, tc->snd_congestion)) + tc->snd_nxt = tc->snd_congestion; + } } } else { tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->opt.tsecr; + tc->rcv_dupacks = 0; } - - tc->rcv_dupacks = 0; - tc->tsecr_last_ack = tc->opt.tsecr; } static void tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) { - ASSERT (tc->snd_una == ack); +// ASSERT (seq_geq(tc->snd_una, ack)); tc->rcv_dupacks++; if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) @@ -688,20 +798,39 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, { u32 new_snd_wnd; - /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) then send an - * ACK, drop the segment, and return */ + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) { - tcp_make_ack (tc, b); - *next = tcp_next_output (tc->c_is_ip4); - *error = TCP_ERROR_ACK_INVALID; - return -1; + /* If we have outstanding data and this is within the window, accept it, + * probably retransmit has timed out. Otherwise ACK segment and then + * drop it */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0, + vnet_buffer (b)->tcp.ack_number); + return -1; + } + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, + vnet_buffer (b)->tcp.ack_number); } - /* If old ACK, discard */ + /* If old ACK, probably it's an old dupack */ if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) { *error = TCP_ERROR_ACK_OLD; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, + vnet_buffer (b)->tcp.ack_number); + if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); + tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + } return -1; } @@ -712,32 +841,40 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) { + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); *error = TCP_ERROR_ACK_DUP; return -1; } - /* Valid ACK */ + /* + * Valid ACK + */ + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; - tc->snd_una = vnet_buffer (b)->tcp.ack_number; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; - /* Dequeue ACKed packet and update RTT */ + /* Dequeue ACKed data and update RTT */ tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); - tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, vnet_buffer (b)->tcp.ack_number, new_snd_wnd); - /* Updates congestion control (slow start/congestion avoidance) */ - tcp_cc_rcv_ack (tc); + /* If some of our sent bytes have been acked, update cc and retransmit + * timer. */ + if (tc->bytes_acked) + { + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number); - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + /* Updates congestion control (slow start/congestion avoidance) */ + tcp_cc_rcv_ack (tc, b); - /* If everything has been acked, stop retransmit timer - * otherwise update */ - if (tc->snd_una == tc->snd_una_max) - tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); - else - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, tc->rto); + /* If everything has been acked, stop retransmit timer + * otherwise update */ + if (tc->snd_una == tc->snd_una_max) + tcp_retransmit_timer_reset (tc); + else + tcp_retransmit_timer_update (tc); + } return 0; } @@ -757,9 +894,7 @@ static void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { sack_block_t *new_list = 0, block; - u32 n_elts; int i; - u8 new_head = 0; /* If the first segment is ooo add it to the list. Last write might've moved * rcv_nxt over the first segment. */ @@ -768,7 +903,6 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) block.start = start; block.end = end; vec_add1 (new_list, block); - new_head = 1; } /* Find the blocks still worth keeping. */ @@ -782,20 +916,19 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) || seq_leq (tc->snd_sacks[i].start, end)) continue; - /* Save subsequent segments to new SACK list. */ - n_elts = clib_min (vec_len (tc->snd_sacks) - i, - TCP_MAX_SACK_BLOCKS - new_head); - vec_insert_elts (new_list, &tc->snd_sacks[i], n_elts, new_head); - break; + /* Save to new SACK list. */ + vec_add1 (new_list, tc->snd_sacks[i]); } + ASSERT (vec_len (new_list) < TCP_MAX_SACK_BLOCKS); + /* Replace old vector with new one */ vec_free (tc->snd_sacks); tc->snd_sacks = new_list; } /** Enqueue data for delivery to application */ -always_inline u32 +always_inline int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { @@ -812,6 +945,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, vlib_buffer_get_current (b), data_len, 1 /* queue event */ ); + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written); + /* Update rcv_nxt */ if (PREDICT_TRUE (written == data_len)) { @@ -824,38 +959,61 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Send ACK confirming the update */ tc->flags |= TCP_CONN_SNDACK; + } + else if (written > 0) + { + /* We've written something but FIFO is probably full now */ + tc->rcv_nxt += written; - /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->opt)) - { - /* Remove SACK blocks that have been delivered */ - tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); - } + /* Depending on how fast the app is, all remaining buffers in burst will + * not be enqueued. Should we inform peer of the damage? XXX */ + return TCP_ERROR_PARTIALLY_ENQUEUED; } else { - ASSERT (0); return TCP_ERROR_FIFO_FULL; } + /* Update SACK list if need be */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* Remove SACK blocks that have been delivered */ + tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); + } + return TCP_ERROR_ENQUEUED; } /** Enqueue out-of-order data */ -always_inline u32 +always_inline int tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; u32 offset, seq; + int rv; + + /* Pure ACK. Do nothing */ + if (PREDICT_FALSE (data_len == 0)) + { + return TCP_ERROR_PURE_ACK; + } s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); seq = vnet_buffer (b)->tcp.seq_number; offset = seq - tc->rcv_nxt; - if (svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, - data_len, vlib_buffer_get_current (b))) - return TCP_ERROR_FIFO_FULL; + rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, + data_len, vlib_buffer_get_current (b)); + + /* Nothing written */ + if (rv) + { + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0); + return TCP_ERROR_FIFO_FULL; + } + + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); /* Update SACK list if in use */ if (tcp_opts_sack_permitted (&tc->opt)) @@ -875,20 +1033,23 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, } /** - * Check if ACK could be delayed. DELACK timer is set only after frame is - * processed so this can return true for a full bursts of packets. + * Check if ACK could be delayed. If ack can be delayed, it should return + * true for a full frame. If we're always acking return 0. */ always_inline int tcp_can_delack (tcp_connection_t * tc) { - /* If there's no DELACK timer set and the last window sent wasn't 0 we - * can safely delay. */ - if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK) - && (tc->flags & TCP_CONN_SENT_RCV_WND0) == 0 - && (tc->flags & TCP_CONN_SNDACK) == 0) - return 1; + /* Send ack if ... */ + if (TCP_ALWAYS_ACK + /* just sent a rcv wnd 0 */ + || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 + /* constrained to send ack */ + || (tc->flags & TCP_CONN_SNDACK) != 0 + /* we're almost out of tx wnd */ + || tcp_available_snd_space (tc) < 2 * tc->snd_mss) + return 0; - return 0; + return 1; } static int @@ -900,23 +1061,33 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) { + /* Old sequence numbers allowed through because they overlapped + * the rx window */ + if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) + { + error = TCP_ERROR_SEGMENT_OLD; + *next0 = TCP_NEXT_DROP; + goto done; + } + error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); - /* Don't send more than 3 dupacks per burst - * XXX decide if this is good */ - if (tc->snt_dupacks < 3) - { - /* RFC2581: Send DUPACK for fast retransmit */ - tcp_make_ack (tc, b); - *next0 = tcp_next_output (tc->c_is_ip4); + /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open + * cwnd on remote peer when congested 2) acks leaving should have the + * latest rcv_wnd since the burst may eaten up all of it, so only the + * old ones could be filtered. + */ - /* Mark as DUPACK. We may filter these in output if - * the burst fills the holes. */ - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + /* RFC2581: Send DUPACK for fast retransmit */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); - tc->snt_dupacks++; - } + /* Mark as DUPACK. We may filter these in output if + * the burst fills the holes. */ + if (n_data_bytes) + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc); goto done; } @@ -924,63 +1095,45 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); - TCP_EVT_DBG (TCP_EVT_INPUT, tc, n_data_bytes); + if (n_data_bytes == 0) + { + *next0 = TCP_NEXT_DROP; + goto done; + } + + if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL)) + *next0 = TCP_NEXT_DROP; /* Check if ACK can be delayed */ - if (tcp_can_delack (tc)) + if (!tcp_can_delack (tc)) { - /* Nothing to do for pure ACKs */ + /* Nothing to do for pure ACKs XXX */ if (n_data_bytes == 0) goto done; - /* If connection has not been previously marked for delay ack - * add it to the list and flag it */ - if (!tc->flags & TCP_CONN_DELACK) - { - vec_add1 (tm->delack_connections[tc->c_thread_index], - tc->c_c_index); - tc->flags |= TCP_CONN_DELACK; - } + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); } else { - /* Check if a packet has already been enqueued to output for burst. - * If yes, then drop this one, otherwise, let it pass through to - * output */ - if ((tc->flags & TCP_CONN_BURSTACK) == 0) - { - *next0 = tcp_next_output (tc->c_is_ip4); - tcp_make_ack (tc, b); - error = TCP_ERROR_ENQUEUED; - - /* TODO: maybe add counter to ensure N acks will be sent/burst */ - tc->flags |= TCP_CONN_BURSTACK; - } + if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) + tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME); } done: return error; } -void -delack_timers_init (tcp_main_t * tm, u32 thread_index) +always_inline void +tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val) { - tcp_connection_t *tc; - u32 i, *conns; - tw_timer_wheel_16t_2w_512sl_t *tw; - - tw = &tm->timer_wheels[thread_index]; - conns = tm->delack_connections[thread_index]; - for (i = 0; i < vec_len (conns); i++) - { - tc = pool_elt_at_index (tm->connections[thread_index], conns[i]); - ASSERT (0 != tc); + if (PREDICT_TRUE (!val)) + return; - tc->timers[TCP_TIMER_DELACK] - = tw_timer_start_16t_2w_512sl (tw, conns[i], - TCP_TIMER_DELACK, TCP_DELACK_TIME); - } - vec_reset_length (tm->delack_connections[thread_index]); + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val); } always_inline uword @@ -1027,7 +1180,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (tc0 == 0)) { error0 = TCP_ERROR_INVALID_CONNECTION; - goto drop; + goto done; } /* Checksum computed by ipx_local no need to compute again */ @@ -1061,18 +1214,22 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) { error0 = TCP_ERROR_SEGMENT_INVALID; - goto drop; + TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, + vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); + goto done; } /* 5: check the ACK field */ if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) { - goto drop; + goto done; } /* 6: check the URG bit TODO */ /* 7: process the segment text */ + vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); @@ -1088,7 +1245,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } - drop: + done: b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -1103,17 +1260,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } errors = session_manager_flush_enqueue_events (my_thread_index); - if (errors) - { - if (is_ip4) - vlib_node_increment_counter (vm, tcp4_established_node.index, - TCP_ERROR_EVENT_FIFO_FULL, errors); - else - vlib_node_increment_counter (vm, tcp6_established_node.index, - TCP_ERROR_EVENT_FIFO_FULL, errors); - } - - delack_timers_init (tm, my_thread_index); + tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); return from_frame->n_vectors; } @@ -1602,7 +1749,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); + tcp_retransmit_timer_reset (tc0); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -1668,7 +1815,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); /* Stop retransmit */ - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); + tcp_retransmit_timer_reset (tc0); goto drop; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 114a5b9e..a671f728 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -125,15 +125,33 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) u32 tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) { - u32 available_space, max_fifo, observed_wnd; - if (state < TCP_STATE_ESTABLISHED) return tcp_initial_window_to_advertise (tc); + tcp_update_rcv_wnd (tc); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } + else + { + tc->flags &= ~TCP_CONN_SENT_RCV_WND0; + } + + return tc->rcv_wnd >> tc->rcv_wscale; +} + +void +tcp_update_rcv_wnd (tcp_connection_t * tc) +{ + i32 observed_wnd; + u32 available_space, max_fifo, wnd; + /* * Figure out how much space we have available */ - available_space = stream_session_max_enqueue (&tc->connection); + available_space = stream_session_max_rx_enqueue (&tc->connection); max_fifo = stream_session_fifo_size (&tc->connection); ASSERT (tc->opt.mss < max_fifo); @@ -145,23 +163,25 @@ tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) * Use the above and what we know about what we've previously advertised * to compute the new window */ - observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + if (observed_wnd < 0) + observed_wnd = 0; /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - if (available_space == 0) - clib_warning ("Didn't shrink rcv window despite not having space"); + /* Does happen! */ + wnd = observed_wnd; } - - tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); - - if (tc->rcv_wnd == 0) + else { - tc->flags |= TCP_CONN_SENT_RCV_WND0; + wnd = available_space; } - return tc->rcv_wnd >> tc->rcv_wscale; + if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) + wnd += 1 << tc->rcv_wscale; + + tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } /** @@ -363,7 +383,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = tm->vlib_main->cpu_index; \ + u32 cpu_index = os_get_cpu_number(); \ my_tx_buffers = tm->tx_buffers[cpu_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ @@ -381,6 +401,14 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 cpu_index = os_get_cpu_number(); \ + my_tx_buffers = tm->tx_buffers[cpu_index]; \ + _vec_len (my_tx_buffers) +=1; \ +} while (0) + always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { @@ -421,8 +449,6 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); tcp_options_write ((u8 *) (th + 1), snd_opts); - - /* Mark as ACK */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; } @@ -432,12 +458,12 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, void tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); } /** @@ -446,8 +472,7 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 flags = 0; tcp_reuse_buffer (vm, b); @@ -467,8 +492,7 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_options_t _snd_opts, *snd_opts = &_snd_opts; u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; @@ -631,7 +655,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_len, flags = 0; tcp_header_t *th, *pkt_th; u32 seq, ack; @@ -736,7 +760,7 @@ tcp_send_syn (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_opts_len, tcp_opts_len; tcp_header_t *th; u32 time_now; @@ -795,9 +819,9 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; - f = vlib_get_frame_to_node (vm, next_index); /* Enqueue the packet */ + f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); to_next[0] = bi; f->n_vectors = 1; @@ -813,7 +837,7 @@ tcp_send_fin (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); @@ -884,22 +908,21 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + /* TODO this is updated in output as well ... */ + if (tc->snd_nxt > tc->snd_una_max) + tc->snd_una_max = tc->snd_nxt; TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } -/* Send delayed ACK when timer expires */ void -tcp_timer_delack_handler (u32 index) +tcp_send_ack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 thread_index = os_get_cpu_number (); - tcp_connection_t *tc; + vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; u32 bi; - tc = tcp_connection_get (index, thread_index); - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); @@ -907,12 +930,22 @@ tcp_timer_delack_handler (u32 index) /* Fill in the ACK */ tcp_make_ack (tc, b); - tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; - tc->flags &= ~TCP_CONN_DELACK; - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } +/* Send delayed ACK when timer expires */ +void +tcp_timer_delack_handler (u32 index) +{ + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; +// tc->flags &= ~TCP_CONN_DELACK; + tcp_send_ack (tc); +} + /** Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to @@ -920,59 +953,74 @@ tcp_timer_delack_handler (u32 index) * */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 max_bytes) + u32 offset, u32 max_bytes) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 n_bytes, offset = 0; - sack_scoreboard_hole_t *hole; - u32 hole_size; + vlib_main_t *vm = vlib_get_main (); + u32 n_bytes = 0; tcp_reuse_buffer (vm, b); ASSERT (tc->state >= TCP_STATE_ESTABLISHED); ASSERT (max_bytes != 0); - if (tcp_opts_sack_permitted (&tc->opt)) - { - /* XXX get first hole not retransmitted yet */ - hole = scoreboard_first_hole (&tc->sack_sb); - if (!hole) - return 0; - - offset = hole->start - tc->snd_una; - hole_size = hole->end - hole->start; + max_bytes = clib_min (tc->snd_mss, max_bytes); - ASSERT (hole_size); + /* Start is beyond snd_congestion */ + if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + goto done; - if (hole_size < max_bytes) - max_bytes = hole_size; - } - else + /* Don't overshoot snd_congestion */ + if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) { - if (seq_geq (tc->snd_nxt, tc->snd_una_max)) - return 0; + max_bytes = tc->snd_congestion - tc->snd_nxt; + if (max_bytes == 0) + goto done; } + ASSERT (max_bytes <= tc->snd_mss); + n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); ASSERT (n_bytes != 0); - + b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state); +done: + TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); return n_bytes; } +/** + * Reset congestion control, switch cwnd to loss window and try again. + */ +static void +tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +{ + /* Cleanly recover cc (also clears up fast retransmit) */ + if (tcp_in_fastrecovery (tc)) + { + tcp_cc_recover (tc); + } + else + { + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); + } + + /* Start again from the beginning */ + tc->cwnd = tcp_loss_wnd (tc); + tc->snd_congestion = tc->snd_una_max; +} + static void tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u32 thread_index = os_get_cpu_number (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, max_bytes, snd_space; + u32 bi, snd_space, n_bytes; if (is_syn) { @@ -998,26 +1046,43 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->state >= TCP_STATE_ESTABLISHED) { - tcp_fastrecovery_off (tc); + /* First retransmit timeout */ + if (tc->rto_boff == 1) + tcp_rtx_timeout_cc_recover (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); /* Figure out what and how many bytes we can send */ snd_space = tcp_available_snd_space (tc); - max_bytes = clib_min (tc->snd_mss, snd_space); - if (max_bytes == 0) + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); + + if (snd_space == 0) { clib_warning ("no wnd to retransmit"); + tcp_return_buffer (tm); + + /* Force one segment */ + tcp_retransmit_first_unacked (tc); + + /* Re-enable retransmit timer. Output may be unwilling + * to do it for us */ + tcp_retransmit_timer_set (tc); + return; } - tcp_prepare_retransmit_segment (tc, b, max_bytes); + else + { + /* No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); + scoreboard_clear (&tc->sack_sb); - tc->rtx_bytes += max_bytes; + if (n_bytes == 0) + return; - /* No fancy recovery for now! */ - scoreboard_clear (&tc->sack_sb); + tc->rtx_bytes += n_bytes; + } } else { @@ -1072,63 +1137,110 @@ tcp_timer_retransmit_syn_handler (u32 index) } /** - * Retansmit first unacked segment */ + * Retransmit first unacked segment + */ void tcp_retransmit_first_unacked (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_nxt = tc->snd_nxt; + vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi; + u32 bi, n_bytes; tc->snd_nxt = tc->snd_una; /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - tcp_prepare_retransmit_segment (tc, b, tc->snd_mss); - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + if (n_bytes == 0) + return; - tc->snd_nxt = snd_nxt; - tc->rtx_bytes += tc->snd_mss; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc->rtx_bytes += n_bytes; +} + +sack_scoreboard_hole_t * +scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole = 0; + +// hole = scoreboard_first_hole (&tc->sack_sb); +// if (hole) +// { +// +// offset = hole->start - tc->snd_una; +// hole_size = hole->end - hole->start; +// +// ASSERT(hole_size); +// +// if (hole_size < max_bytes) +// max_bytes = hole_size; +// } + return hole; } +/** + * Do fast retransmit. + */ void tcp_fast_retransmit (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_space, max_bytes, n_bytes, bi; + vlib_main_t *vm = vlib_get_main (); + u32 bi; + int snd_space; + u32 n_written = 0, offset = 0; vlib_buffer_t *b; + u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); - clib_warning ("fast retransmit!"); - /* Start resending from first un-acked segment */ tc->snd_nxt = tc->snd_una; snd_space = tcp_available_snd_space (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + /* If we have SACKs use them */ + if (tcp_opts_sack_permitted (&tc->opt) + && scoreboard_first_hole (&tc->sack_sb)) + use_sacks = 0; - while (snd_space) + while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); + + if (use_sacks) + { + scoreboard_first_rtx_hole (&tc->sack_sb); + } + else + { + offset += n_written; + } - max_bytes = clib_min (tc->snd_mss, snd_space); - n_bytes = tcp_prepare_retransmit_segment (tc, b, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ - if (n_bytes == 0) - return; - - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } - snd_space -= n_bytes; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc->rtx_bytes += n_written; + snd_space -= n_written; } - /* If window allows, send new data */ - tc->snd_nxt = tc->snd_una_max; + /* If window allows, send 1 SMSS of new data */ + if (seq_lt (tc->snd_nxt, tc->snd_congestion)) + tc->snd_nxt = tc->snd_congestion; } always_inline u32 @@ -1209,8 +1321,6 @@ tcp46_output_inline (vlib_main_t * vm, if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) { - ASSERT (tc0->snt_dupacks > 0); - tc0->snt_dupacks--; if (!tcp_session_has_ooo_data (tc0)) { error0 = TCP_ERROR_FILTERED_DUPACKS; @@ -1223,8 +1333,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rcv_las = tc0->rcv_nxt; /* Stop DELACK timer and fix flags */ - tc0->flags &= - ~(TCP_CONN_SNDACK | TCP_CONN_DELACK | TCP_CONN_BURSTACK); + tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) { tcp_timer_reset (tc0, TCP_TIMER_DELACK); diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index 866c5fd6..4f28cf32 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -137,7 +137,7 @@ enum typedef struct _sack_block { u32 start; /**< Start sequence number */ - u32 end; /**< End sequence number */ + u32 end; /**< End sequence number (first outside) */ } sack_block_t; typedef struct diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c new file mode 100644 index 00000000..0725bb04 --- /dev/null +++ b/src/vnet/tcp/tcp_test.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define TCP_TEST_I(_cond, _comment, _args...) \ +({ \ + int _evald = (_cond); \ + if (!(_evald)) { \ + fformat(stderr, "FAIL:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } else { \ + fformat(stderr, "PASS:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } \ + _evald; \ +}) + +#define TCP_TEST(_cond, _comment, _args...) \ +{ \ + if (!TCP_TEST_I(_cond, _comment, ##_args)) { \ + return 1; \ + } \ +} + +static int +tcp_test_sack () +{ + tcp_connection_t _tc, *tc = &_tc; + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *sacks = 0, block; + sack_scoreboard_hole_t *hole; + int i; + + memset (tc, 0, sizeof (*tc)); + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + tc->opt.flags |= TCP_OPTS_FLAG_SACK; + scoreboard_init (&tc->sack_sb); + + for (i = 0; i < 1000 / 100; i++) + { + block.start = i * 100; + block.end = (i + 1) * 100; + vec_add1 (sacks, block); + } + + /* + * Inject even blocks + */ + + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->opt.sacks, sacks[i * 2]); + } + tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tcp_rcv_sacks (tc, 0); + + TCP_TEST ((pool_elts (sb->holes) == 5), + "scoreboard has %d elements", pool_elts (sb->holes)); + + /* First SACK block should be rejected */ + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 0 && hole->end == 200), + "first hole start %u end %u", hole->start, hole->end); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 900 && hole->end == 1000), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 400), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->last_sacked_bytes == 400), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Inject odd blocks + */ + + vec_reset_length (tc->opt.sacks); + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + } + tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tcp_rcv_sacks (tc, 0); + + hole = scoreboard_first_hole (sb); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d holes", pool_elts (sb->holes)); + TCP_TEST ((hole->start == 0 && hole->end == 100), + "first hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->max_byte_sacked == 1000), + "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->last_sacked_bytes == 500), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Ack until byte 100, all bytes are now acked + sacked + */ + tcp_rcv_sacks (tc, 100); + + TCP_TEST ((pool_elts (sb->holes) == 0), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 900), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->max_byte_sacked == 1000), + "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Add new block + */ + + vec_reset_length (tc->opt.sacks); + + block.start = 1200; + block.end = 1300; + vec_add1 (tc->opt.sacks, block); + + tc->snd_una_max = 1500; + tc->snd_una = 1000; + tc->snd_nxt = 1500; + tcp_rcv_sacks (tc, 1000); + + TCP_TEST ((sb->snd_una_adv == 0), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((pool_elts (sb->holes) == 2), + "scoreboard has %d holes", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 1000 && hole->end == 1200), + "first hole start %u end %u", hole->start, hole->end); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 1300 && hole->end == 1500), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 100), "sacked bytes %d", sb->sacked_bytes); + + /* + * Ack first hole + */ + + vec_reset_length (tc->opt.sacks); + tcp_rcv_sacks (tc, 1200); + + TCP_TEST ((sb->snd_una_adv == 100), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d elements", pool_elts (sb->holes)); + + /* + * Remove all + */ + + scoreboard_clear (sb); + TCP_TEST ((pool_elts (sb->holes) == 0), + "number of holes %d", pool_elts (sb->holes)); + return 0; +} + +static clib_error_t * +tcp_test (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + int res = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "sack")) + { + res = tcp_test_sack (); + } + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + } + + if (res) + { + return clib_error_return (0, "TCP unit test failed"); + } + else + { + return 0; + } +} + +VLIB_CLI_COMMAND (tcp_test_command, static) = +{ +.path = "test tcp",.short_help = "internal tcp unit tests",.function = + tcp_test,}; +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c index 46c8e734..57f774c5 100644 --- a/src/vnet/udp/builtin_server.c +++ b/src/vnet/udp/builtin_server.c @@ -39,10 +39,10 @@ builtin_session_disconnect_callback (stream_session_t * s) } static int -builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) +builtin_server_rx_callback (stream_session_t * s) { svm_fifo_t *rx_fifo, *tx_fifo; - u32 this_transfer; + u32 this_transfer, max_deq, max_enq; int actual_transfer; u8 *my_copy_buffer; session_fifo_event_t evt; @@ -52,9 +52,9 @@ builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; - this_transfer = svm_fifo_max_enqueue (tx_fifo) - < svm_fifo_max_dequeue (rx_fifo) ? - svm_fifo_max_enqueue (tx_fifo) : svm_fifo_max_dequeue (rx_fifo); + max_deq = svm_fifo_max_dequeue (rx_fifo); + max_enq = svm_fifo_max_enqueue (tx_fifo); + this_transfer = max_enq < max_deq ? max_enq : max_deq; vec_validate (my_copy_buffer, this_transfer - 1); _vec_len (my_copy_buffer) = this_transfer; @@ -64,17 +64,20 @@ builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) ASSERT (actual_transfer == this_transfer); actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, 0, this_transfer, my_copy_buffer); + ASSERT (actual_transfer == this_transfer); copy_buffers[s->thread_index] = my_copy_buffer; - /* Fabricate TX event, send to ourselves */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = actual_transfer; - evt.event_id = 0; - q = session_manager_get_vpp_event_queue (s->thread_index); - unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to ourselves */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = 0; + q = session_manager_get_vpp_event_queue (s->thread_index); + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } return 0; } diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 88278735..4b22109b 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -244,44 +244,53 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* Get session's server */ server0 = application_get (s0->app_index); - /* Fabricate event */ - evt.fifo = s0->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; - evt.event_id = serial_number++; - evt.enqueue_length = svm_fifo_max_dequeue (s0->server_rx_fifo); - /* Built-in server? Deliver the goods... */ if (server0->cb_fns.builtin_server_rx_callback) { - server0->cb_fns.builtin_server_rx_callback (s0, &evt); + server0->cb_fns.builtin_server_rx_callback (s0); continue; } - /* Add event to server's event queue */ - q = server0->event_queue; - - /* Don't block for lack of space */ - if (PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (server0->event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - else + if (svm_fifo_set_event (s0->server_rx_fifo)) { - vlib_node_increment_counter (vm, udp4_uri_input_node.index, - SESSION_ERROR_FIFO_FULL, 1); + /* Fabricate event */ + evt.fifo = s0->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + + /* Add event to server's event queue */ + q = server0->event_queue; + + /* Don't block for lack of space */ + if (PREDICT_TRUE (q->cursize < q->maxsize)) + { + unix_shared_memory_queue_add (server0->event_queue, + (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + else + { + vlib_node_increment_counter (vm, udp4_uri_input_node.index, + SESSION_ERROR_FIFO_FULL, 1); + } } + /* *INDENT-OFF* */ if (1) { ELOG_TYPE_DECLARE (e) = { - .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + .format = "evt-enqueue: id %d length %d", + .format_args = "i4i4",}; struct { u32 data[2]; } *ed; ed = ELOG_DATA (&vlib_global_main.elog_main, e); ed->data[0] = evt.event_id; - ed->data[1] = evt.enqueue_length; + ed->data[1] = svm_fifo_max_dequeue (s0->server_rx_fifo); } + /* *INDENT-ON* */ + } vec_reset_length (session_indices_to_enqueue); -- cgit 1.2.3-korg From f6359c8cace5b73a813e5f4e3d1bc28f7752fcdf Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 19 Jun 2017 12:26:09 -0400 Subject: Improve svm fifo and tcp tx path performance (VPP-846) - multiarch on svm fifo - avoid ip lookup on tx Change-Id: Iab0d85204a710979417bca1d692cc47877131203 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 95 +++++++++++++++++++++++++++++++++++++--- src/uri/uri_tcp_test.c | 10 ++--- src/vnet/session/node.c | 13 +++--- src/vnet/session/transport.h | 7 ++- src/vnet/tcp/tcp.c | 100 +++++++++++++++++++++++++++++++++++++++++-- src/vnet/tcp/tcp.h | 4 ++ src/vnet/tcp/tcp_output.c | 29 +++++++++---- src/vnet/tcp/tcp_packet.h | 9 ++-- 8 files changed, 232 insertions(+), 35 deletions(-) (limited to 'src/vnet/tcp/tcp_packet.h') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 6ca437cf..aed5d6a7 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -14,6 +14,7 @@ */ #include +#include static inline u8 position_lt (svm_fifo_t * f, u32 a, u32 b) @@ -417,10 +418,38 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) return (total_copy_bytes); } +#define SVM_ENQUEUE_CLONE_TEMPLATE(arch, fn, tgt) \ + uword \ + __attribute__ ((flatten)) \ + __attribute__ ((target (tgt))) \ + CLIB_CPU_OPTIMIZED \ + fn ## _ ## arch ( svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) \ + { return fn (f, max_bytes, copy_from_here);} + +static int +svm_fifo_enqueue_nowait_ma (svm_fifo_t * f, u32 max_bytes, + u8 * copy_from_here) +{ + return svm_fifo_enqueue_internal (f, max_bytes, copy_from_here); +} + +foreach_march_variant (SVM_ENQUEUE_CLONE_TEMPLATE, + svm_fifo_enqueue_nowait_ma); +CLIB_MULTIARCH_SELECT_FN (svm_fifo_enqueue_nowait_ma); + int svm_fifo_enqueue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) { - return svm_fifo_enqueue_internal (f, max_bytes, copy_from_here); +#if CLIB_DEBUG > 0 + return svm_fifo_enqueue_nowait_ma (f, max_bytes, copy_from_here); +#else + static int (*fp) (svm_fifo_t *, u32, u8 *); + + if (PREDICT_FALSE (fp == 0)) + fp = (void *) svm_fifo_enqueue_nowait_ma_multiarch_select (); + + return (*fp) (f, max_bytes, copy_from_here); +#endif } /** @@ -541,15 +570,43 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) return (total_copy_bytes); } -int -svm_fifo_dequeue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) +static int +svm_fifo_dequeue_nowait_ma (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) { return svm_fifo_dequeue_internal (f, max_bytes, copy_here); } +#define SVM_FIFO_DEQUEUE_CLONE_TEMPLATE(arch, fn, tgt) \ + uword \ + __attribute__ ((flatten)) \ + __attribute__ ((target (tgt))) \ + CLIB_CPU_OPTIMIZED \ + fn ## _ ## arch ( svm_fifo_t * f, u32 max_bytes, \ + u8 * copy_here) \ + { return fn (f, max_bytes, copy_here);} + +foreach_march_variant (SVM_FIFO_DEQUEUE_CLONE_TEMPLATE, + svm_fifo_dequeue_nowait_ma); +CLIB_MULTIARCH_SELECT_FN (svm_fifo_dequeue_nowait_ma); + int -svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, - u8 * copy_here) +svm_fifo_dequeue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) +{ +#if CLIB_DEBUG > 0 + return svm_fifo_dequeue_nowait_ma (f, max_bytes, copy_here); +#else + static int (*fp) (svm_fifo_t *, u32, u8 *); + + if (PREDICT_FALSE (fp == 0)) + fp = (void *) svm_fifo_dequeue_nowait_ma_multiarch_select (); + + return (*fp) (f, max_bytes, copy_here); +#endif +} + +static int +svm_fifo_peek_ma (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, + u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems, real_head; @@ -586,6 +643,34 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, return total_copy_bytes; } +#define SVM_FIFO_PEEK_CLONE_TEMPLATE(arch, fn, tgt) \ + uword \ + __attribute__ ((flatten)) \ + __attribute__ ((target (tgt))) \ + CLIB_CPU_OPTIMIZED \ + fn ## _ ## arch ( svm_fifo_t * f, u32 relative_offset, u32 max_bytes, \ + u8 * copy_here) \ + { return fn (f, relative_offset, max_bytes, copy_here);} + +foreach_march_variant (SVM_FIFO_PEEK_CLONE_TEMPLATE, svm_fifo_peek_ma); +CLIB_MULTIARCH_SELECT_FN (svm_fifo_peek_ma); + +int +svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, + u8 * copy_here) +{ +#if CLIB_DEBUG > 0 + return svm_fifo_peek_ma (f, relative_offset, max_bytes, copy_here); +#else + static int (*fp) (svm_fifo_t *, u32, u32, u8 *); + + if (PREDICT_FALSE (fp == 0)) + fp = (void *) svm_fifo_peek_ma_multiarch_select (); + + return (*fp) (f, relative_offset, max_bytes, copy_here); +#endif +} + int svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) { diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index d1694cf4..80aab183 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -398,7 +398,6 @@ static void vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - session_t *session; vl_api_reset_session_reply_t *rmp; uword *p; int rv = 0; @@ -407,9 +406,8 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) if (p) { - session = pool_elt_at_index (utm->sessions, p[0]); - hash_unset (utm->session_index_by_vpp_handles, mp->handle); - pool_put (utm->sessions, session); + clib_warning ("got reset"); + /* Cleanup later */ utm->time_to_stop = 1; } else @@ -603,7 +601,7 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, if (bytes_to_snd > vec_len (test_data)) bytes_to_snd = vec_len (test_data); - while (bytes_to_snd > 0) + while (bytes_to_snd > 0 && !utm->time_to_stop) { actual_write = (bytes_to_snd > queue_max_chunk) ? queue_max_chunk : bytes_to_snd; @@ -652,6 +650,8 @@ client_send_data (uri_tcp_test_main_t * utm) for (i = 0; i < n_iterations; i++) { send_test_chunk (utm, tx_fifo, mypid, 0); + if (utm->time_to_stop) + break; } leftover = utm->bytes_to_send % vec_len (test_data); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index c0ab1bf0..b24f5fd9 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -248,6 +248,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, ASSERT (bi0); _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + /* usual speculation, or the enqueue_x1 macro will barf */ + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + b0 = vlib_get_buffer (vm, bi0); b0->error = 0; b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID @@ -255,10 +260,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; - /* RX on the local interface. tx in default fib */ - vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - len_to_deq0 = clib_min (left_to_snd0, deq_per_buf); data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); @@ -307,10 +308,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, })); /* *INDENT-ON* */ - /* usual speculation, or the enqueue_x1 macro will barf */ - to_next[0] = bi0; - to_next += 1; - n_left_to_next -= 1; VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); if (PREDICT_FALSE (n_trace > 0)) diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 04bd5ca0..561a9257 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -30,13 +30,16 @@ typedef struct _transport_connection ip46_address_t lcl_ip; /**< Local IP */ u16 lcl_port; /**< Local port */ u16 rmt_port; /**< Remote port */ - u8 proto; /**< Transport protocol id (also session type) */ + u8 proto; /**< Protocol id (also session type) */ u32 s_index; /**< Parent session index */ u32 c_index; /**< Connection index in transport pool */ u8 is_ip4; /**< Flag if IP4 connection */ u32 thread_index; /**< Worker-thread index */ + fib_node_index_t rmt_fei; /**< FIB entry index for rmt */ + dpo_id_t rmt_dpo; /**< Forwarding DPO for rmt */ + #if TRANSPORT_DEBUG elog_track_t elog_track; /**< Event logging */ u32 cc_stat_tstamp; /**< CC stats timestamp */ @@ -59,6 +62,8 @@ typedef struct _transport_connection #define c_thread_index connection.thread_index #define c_elog_track connection.elog_track #define c_cc_stat_tstamp connection.cc_stat_tstamp +#define c_rmt_fei connection.rmt_fei +#define c_rmt_dpo connection.rmt_dpo } transport_connection_t; /* diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 5c554bac..4e85eb3f 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include tcp_main_t tcp_main; @@ -342,6 +343,99 @@ tcp_connection_timers_reset (tcp_connection_t * tc) } } +typedef struct ip4_tcp_hdr +{ + ip4_header_t ip; + tcp_header_t tcp; +} ip4_tcp_hdr_t; + +typedef struct ip6_tcp_hdr +{ + ip6_header_t ip; + tcp_header_t tcp; +} ip6_tcp_hdr_t; + +static void +tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo, + dpo_id_t * result) +{ + const dpo_id_t *choice; + load_balance_t *lb; + int hash; + + lb = load_balance_get (dpo->dpoi_index); + if (tc->c_is_ip4) + { + ip4_tcp_hdr_t hdr; + memset (&hdr, 0, sizeof (hdr)); + hdr.ip.protocol = IP_PROTOCOL_TCP; + hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32; + hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32; + hdr.tcp.src_port = tc->c_lcl_port; + hdr.tcp.dst_port = tc->c_rmt_port; + hash = ip4_compute_flow_hash (&hdr.ip, lb->lb_hash_config); + } + else + { + ip6_tcp_hdr_t hdr; + memset (&hdr, 0, sizeof (hdr)); + hdr.ip.protocol = IP_PROTOCOL_TCP; + clib_memcpy (&hdr.ip.src_address, &tc->c_lcl_ip.ip6, + sizeof (ip6_address_t)); + clib_memcpy (&hdr.ip.dst_address, &tc->c_rmt_ip.ip6, + sizeof (ip6_address_t)); + hdr.tcp.src_port = tc->c_lcl_port; + hdr.tcp.dst_port = tc->c_rmt_port; + hash = ip6_compute_flow_hash (&hdr.ip, lb->lb_hash_config); + } + choice = load_balance_get_bucket_i (lb, hash & lb->lb_n_buckets_minus_1); + dpo_copy (result, choice); +} + +fib_node_index_t +tcp_lookup_rmt_in_fib (tcp_connection_t * tc) +{ + fib_prefix_t prefix; + + clib_memcpy (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr)); + prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; + prefix.fp_len = tc->c_is_ip4 ? 32 : 128; + return fib_table_lookup (0, &prefix); +} + +static int +tcp_connection_stack_on_fib_entry (tcp_connection_t * tc) +{ + dpo_id_t choice = DPO_INVALID; + u32 output_node_index; + fib_entry_t *fe; + + fe = fib_entry_get (tc->c_rmt_fei); + if (fe->fe_lb.dpoi_type != DPO_LOAD_BALANCE) + return -1; + + tcp_connection_select_lb_bucket (tc, &fe->fe_lb, &choice); + + output_node_index = + tc->c_is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + dpo_stack_from_node (output_node_index, &tc->c_rmt_dpo, &choice); + return 0; +} + +/** Stack tcp connection on peer's fib entry. + * + * This ultimately populates the dpo the connection will use to send packets. + */ +static void +tcp_connection_fib_attach (tcp_connection_t * tc) +{ + tc->c_rmt_fei = tcp_lookup_rmt_in_fib (tc); + + ASSERT (tc->c_rmt_fei != FIB_NODE_INDEX_INVALID); + + tcp_connection_stack_on_fib_entry (tc); +} + /** Initialize tcp connection variables * * Should be called after having received a msg from the peer, i.e., a SYN or @@ -353,6 +447,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); + tcp_connection_fib_attach (tc); } int @@ -361,7 +456,8 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc; fib_prefix_t prefix; - u32 fei, sw_if_index; + fib_node_index_t fei; + u32 sw_if_index; ip46_address_t lcl_addr; u16 lcl_port; @@ -985,8 +1081,6 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->timer_wheels, num_threads - 1); tcp_initialize_timer_wheels (tm); -// vec_validate (tm->delack_connections, num_threads - 1); - /* Initialize clocks per tick for TCP timestamp. Used to compute * monotonically increasing timestamps. */ tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index e8398718..12d804b8 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -28,6 +28,7 @@ #define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ #define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ +#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */ #define TCP_MAX_OPTION_SPACE 40 #define TCP_DUPACK_THRESHOLD 3 @@ -256,6 +257,7 @@ typedef struct _tcp_connection u16 mss; /**< Our max seg size that includes options */ u32 limited_transmit; /**< snd_nxt when limited transmit starts */ + u32 last_fib_check; /**< Last time we checked fib route for peer */ } tcp_connection_t; struct _tcp_cc_algorithm @@ -528,6 +530,8 @@ void tcp_cc_init_congestion (tcp_connection_t * tc); int tcp_cc_recover (tcp_connection_t * tc); void tcp_cc_fastrecovery_exit (tcp_connection_t * tc); +fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc); + /* Made public for unit testing only */ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 554a981d..41bebcb3 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -22,17 +22,14 @@ vlib_node_registration_t tcp6_output_node; typedef enum _tcp_output_nect { TCP_OUTPUT_NEXT_DROP, - TCP_OUTPUT_NEXT_IP_LOOKUP, TCP_OUTPUT_N_NEXT } tcp_output_next_t; #define foreach_tcp4_output_next \ _ (DROP, "error-drop") \ - _ (IP_LOOKUP, "ip4-lookup") #define foreach_tcp6_output_next \ _ (DROP, "error-drop") \ - _ (IP_LOOKUP, "ip6-lookup") static char *tcp_error_strings[] = { #define tcp_error(n,s) s, @@ -1451,7 +1448,7 @@ tcp46_output_inline (vlib_main_t * vm, tcp_connection_t *tc0; tcp_tx_trace_t *t0; tcp_header_t *th0 = 0; - u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP; bi0 = from[0]; to_next[0] = bi0; @@ -1530,10 +1527,26 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } - /* set fib index to default and lookup node */ - /* XXX network virtualization (vrf/vni) */ - vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + /* Make sure we haven't lost route to our peer */ + if (PREDICT_FALSE (tc0->last_fib_check + < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) + { + if (PREDICT_TRUE + (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0))) + { + tc0->last_fib_check = tc0->snd_opts.tsval; + } + else + { + clib_warning ("lost connection to peer"); + tcp_connection_reset (tc0); + goto done; + } + } + + /* Use pre-computed dpo to set next node */ + next0 = tc0->c_rmt_dpo.dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; done: diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index 4f28cf32..a6f62ee1 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -144,12 +144,11 @@ typedef struct { u8 flags; /** Option flags, see above */ - /* Received options */ - u16 mss; /**< Maximum segment size advertised by peer */ - u8 wscale; /**< Window scale advertised by peer */ - u32 tsval; /**< Peer's timestamp value */ + u16 mss; /**< Maximum segment size advertised */ + u8 wscale; /**< Window scale advertised */ + u32 tsval; /**< Timestamp value */ u32 tsecr; /**< Echoed/reflected time stamp */ - sack_block_t *sacks; /**< SACK blocks received */ + sack_block_t *sacks; /**< SACK blocks */ u8 n_sack_blocks; /**< Number of SACKs blocks */ } tcp_options_t; -- cgit 1.2.3-korg From 2c25a62cc1cc4937165de740a3b32d78429c72d6 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Mon, 26 Jun 2017 11:35:07 -0400 Subject: Horizontal (nSessions) scaling draft - Data structure preallocation. - Input state machine fixes for mid-stream 3-way handshake retries. - Batch connections in the builtin_client - Multiple private fifo segment support - Fix elog simultaneous event type registration - Fix sacks when segment hole is added after highest sacked - Add "accepting" session state for sessions pending accept - Add ssvm non-recursive locking - Estimate RTT for syn-ack - Don't init fifo pointers. We're using relative offsets for ooo segments - CLI to dump individual session Change-Id: Ie0598563fd246537bafba4feed7985478ea1d415 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/svm/ssvm.h | 17 +++ src/svm/svm_fifo.c | 56 +++++--- src/svm/svm_fifo.h | 16 ++- src/svm/svm_fifo_segment.c | 114 +++++++++++----- src/svm/svm_fifo_segment.h | 4 +- src/svm/test_svm_fifo1.c | 10 +- src/uri/uri_udp_test.c | 2 +- src/vnet/session/application.c | 2 + src/vnet/session/application_interface.c | 21 --- src/vnet/session/application_interface.h | 12 +- src/vnet/session/node.c | 23 +--- src/vnet/session/segment_manager.c | 26 ++-- src/vnet/session/segment_manager.h | 4 + src/vnet/session/session.c | 72 +++++++--- src/vnet/session/session.h | 30 ++++- src/vnet/session/session_cli.c | 99 +++++++++++--- src/vnet/session/transport.h | 6 + src/vnet/tcp/builtin_client.c | 118 +++++++++++----- src/vnet/tcp/builtin_client.h | 7 +- src/vnet/tcp/builtin_server.c | 66 +++++++-- src/vnet/tcp/tcp.c | 225 ++++++++++++++++++++++++++++--- src/vnet/tcp/tcp.h | 13 ++ src/vnet/tcp/tcp_debug.h | 13 +- src/vnet/tcp/tcp_input.c | 97 ++++++++----- src/vnet/tcp/tcp_newreno.c | 4 +- src/vnet/tcp/tcp_output.c | 53 +++++--- src/vnet/tcp/tcp_packet.h | 1 + src/vnet/tcp/tcp_test.c | 10 +- src/vnet/udp/udp_input.c | 2 +- 29 files changed, 838 insertions(+), 285 deletions(-) (limited to 'src/vnet/tcp/tcp_packet.h') diff --git a/src/svm/ssvm.h b/src/svm/ssvm.h index bccfc164..8466e155 100644 --- a/src/svm/ssvm.h +++ b/src/svm/ssvm.h @@ -101,6 +101,15 @@ ssvm_lock (ssvm_shared_header_t * h, u32 my_pid, u32 tag) h->tag = tag; } +always_inline void +ssvm_lock_non_recursive (ssvm_shared_header_t * h, u32 tag) +{ + while (__sync_lock_test_and_set (&h->lock, 1)) + ; + + h->tag = tag; +} + always_inline void ssvm_unlock (ssvm_shared_header_t * h) { @@ -113,6 +122,14 @@ ssvm_unlock (ssvm_shared_header_t * h) } } +always_inline void +ssvm_unlock_non_recursive (ssvm_shared_header_t * h) +{ + h->tag = 0; + CLIB_MEMORY_BARRIER (); + h->lock = 0; +} + static inline void * ssvm_push_heap (ssvm_shared_header_t * sh) { diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index aed5d6a7..da60fee5 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -19,29 +19,29 @@ static inline u8 position_lt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - < ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + < ooo_segment_distance_from_tail (f, b)); } static inline u8 position_leq (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - <= ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + <= ooo_segment_distance_from_tail (f, b)); } static inline u8 position_gt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - > ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + > ooo_segment_distance_from_tail (f, b)); } static inline u32 position_diff (svm_fifo_t * f, u32 posa, u32 posb) { - return ooo_segment_distance_to_tail (f, posa) - - ooo_segment_distance_to_tail (f, posb); + return ooo_segment_distance_from_tail (f, posa) + - ooo_segment_distance_from_tail (f, posb); } static inline u32 @@ -113,7 +113,7 @@ svm_fifo_create (u32 data_size_in_bytes) if (f == 0) return 0; - memset (f, 0, sizeof (*f) + data_size_in_bytes); + memset (f, 0, sizeof (*f)); f->nitems = data_size_in_bytes; f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; @@ -204,7 +204,19 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { s = prev; s_end_pos = ooo_segment_end_pos (f, s); - goto merge; + + /* Check head and tail now since segment may be wider at both ends so + * merge tests lower won't work */ + if (position_lt (f, normalized_position, s->start)) + { + s->start = normalized_position; + s->length = position_diff (f, s_end_pos, s->start); + } + if (position_gt (f, normalized_end_position, s_end_pos)) + { + s->length = position_diff (f, normalized_end_position, s->start); + } + goto check_tail; } s_index = s - f->ooo_segments; @@ -257,8 +269,6 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) * Merge needed */ -merge: - /* Merge at head */ if (position_lt (f, normalized_position, s->start)) { @@ -278,6 +288,7 @@ merge: goto done; } +check_tail: /* The new segment's tail may cover multiple smaller ones */ if (position_gt (f, normalized_end_position, s_end_pos)) { @@ -296,7 +307,8 @@ merge: /* If partial overlap with last, merge */ if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = ooo_segment_end_pos (f, it) - s->start; + s->length = + position_diff (f, ooo_segment_end_pos (f, it), s->start); ooo_segment_del (f, it - f->ooo_segments); } } @@ -319,9 +331,9 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) i32 diff; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + diff = ooo_segment_distance_to_tail (f, s->start); - diff = (f->tail >= s->start) ? - f->tail - s->start : f->nitems + f->tail - s->start; + ASSERT (diff != n_bytes_enqueued); if (diff > n_bytes_enqueued) return 0; @@ -345,8 +357,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - diff = (f->tail >= s->start) ? - f->tail - s->start : f->nitems + f->tail - s->start; + diff = ooo_segment_distance_to_tail (f, s->start); ooo_segment_del (f, index); } /* End of search */ @@ -357,6 +368,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } + ASSERT (bytes >= 0 && bytes <= f->nitems); return bytes; } @@ -401,6 +413,8 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) } else { + ASSERT (0); + /* Account for a zero-copy enqueue done elsewhere */ ASSERT (max_bytes <= (nitems - cursize)); f->tail += max_bytes; @@ -413,6 +427,7 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); /* Atomically increase the queue length */ + ASSERT (cursize + total_copy_bytes <= nitems); __sync_fetch_and_add (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -475,6 +490,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; + ASSERT (required_bytes < nitems); + normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? */ @@ -557,6 +574,7 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) } else { + ASSERT (0); /* Account for a zero-copy dequeue done elsewhere */ ASSERT (max_bytes <= cursize); f->head += max_bytes; @@ -565,6 +583,8 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) total_copy_bytes = max_bytes; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_copy_bytes); __sync_fetch_and_sub (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -702,6 +722,8 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) f->head = (f->head == nitems) ? 0 : f->head; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_drop_bytes); __sync_fetch_and_sub (&f->cursize, total_drop_bytes); return total_drop_bytes; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index f32ef41d..fe21de47 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -133,25 +133,31 @@ svm_fifo_newest_ooo_segment (svm_fifo_t * f) } always_inline u32 -ooo_segment_distance_to_tail (svm_fifo_t * f, u32 a) +ooo_segment_distance_from_tail (svm_fifo_t * f, u32 pos) { /* Ambiguous. Assumption is that ooo segments don't touch tail */ - if (a == f->tail && f->tail == f->head) + if (PREDICT_FALSE (pos == f->tail && f->tail == f->head)) return f->nitems; - return ((f->nitems + a - f->tail) % f->nitems); + return (((f->nitems + pos) - f->tail) % f->nitems); +} + +always_inline u32 +ooo_segment_distance_to_tail (svm_fifo_t * f, u32 pos) +{ + return (((f->nitems + f->tail) - pos) % f->nitems); } always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start); + return ooo_segment_distance_from_tail (f, s->start); } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start) + s->length; + return ooo_segment_distance_from_tail (f, s->start) + s->length; } always_inline u32 diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index c4ac2352..69d4ecb9 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -35,6 +35,11 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, rx_fifo_size = (sizeof (*f) + a->rx_fifo_size) * a->preallocated_fifo_pairs; tx_fifo_size = (sizeof (*f) + a->tx_fifo_size) * a->preallocated_fifo_pairs; + if (0) + clib_warning ("rx_fifo_size %u (%d mb), tx_fifo_size %u (%d mb)", + rx_fifo_size, rx_fifo_size >> 20, + tx_fifo_size, tx_fifo_size >> 20); + /* Allocate rx fifo space. May fail. */ rx_fifo_space = clib_mem_alloc_aligned_at_offset (rx_fifo_size, CLIB_CACHE_LINE_BYTES, 0 /* align_offset */ , @@ -129,7 +134,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) ssvm_pop_heap (oldheap); sh->ready = 1; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -141,35 +146,81 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; ssvm_shared_header_t *sh; svm_fifo_segment_header_t *fsh; + void *oldheap; + u8 **heaps = 0; + mheap_t *heap_header; + int segment_count = 1; + int i; - /* Allocate a fresh segment */ - pool_get (sm->segments, s); - memset (s, 0, sizeof (*s)); - - s->ssvm.ssvm_size = ~0; - s->ssvm.i_am_master = 1; - s->ssvm.my_pid = getpid (); - s->ssvm.name = (u8 *) a->segment_name; - s->ssvm.requested_va = ~0; - - /* Allocate a [sic] shared memory header, in process memory... */ - sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); - s->ssvm.sh = sh; + if (a->private_segment_count && a->private_segment_size) + { + void *mem; + u8 *heap; + u32 pagesize = clib_mem_get_page_size (); + u32 rnd_size; - memset (sh, 0, sizeof (*sh)); - sh->heap = clib_mem_get_heap (); + for (i = 0; i < a->private_segment_count; i++) + { + rnd_size = (a->private_segment_size + (pagesize - 1)) & ~pagesize; + + mem = mmap (0, rnd_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1 /* fd */ , 0 /* offset */ ); + + if (mem == MAP_FAILED) + { + clib_unix_warning ("mmap"); + return -1; + } + heap = mheap_alloc (mem, rnd_size); + heap_header = mheap_header (heap); + heap_header->flags |= MHEAP_FLAG_THREAD_SAFE; + vec_add1 (heaps, heap); + } + segment_count = a->private_segment_count; + } - /* Set up svm_fifo_segment shared header */ - fsh = clib_mem_alloc (sizeof (*fsh)); - memset (fsh, 0, sizeof (*fsh)); - sh->opaque[0] = fsh; - s->h = fsh; - fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + /* Spread preallocated fifo pairs across segments */ + a->preallocated_fifo_pairs /= segment_count; - preallocate_fifo_pairs (fsh, a); + /* Allocate segments */ + for (i = 0; i < segment_count; i++) + { + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = ~0; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = ~0; + + /* Allocate a [sic] shared memory header, in process memory... */ + sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); + s->ssvm.sh = sh; + + memset (sh, 0, sizeof (*sh)); + sh->heap = a->private_segment_count ? heaps[i] : clib_mem_get_heap (); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + if (a->private_segment_count) + { + oldheap = clib_mem_get_heap (); + clib_mem_set_heap (sh->heap); + preallocate_fifo_pairs (fsh, a); + clib_mem_set_heap (oldheap); + } - sh->ready = 1; - a->new_segment_index = s - sm->segments; + sh->ready = 1; + vec_add1 (a->new_segment_indices, s - sm->segments); + } + vec_free (heaps); return (0); } @@ -205,7 +256,7 @@ svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; s->h = fsh; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -230,7 +281,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 1); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -261,7 +312,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, if (PREDICT_FALSE (f == 0)) { ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (0); } @@ -281,7 +332,7 @@ found: } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (f); } @@ -293,10 +344,11 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, svm_fifo_segment_header_t *fsh; void *oldheap; + sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 2); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -325,7 +377,7 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); } void diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 31e14db5..a7a3f469 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -57,10 +57,12 @@ typedef struct { char *segment_name; u32 segment_size; - u32 new_segment_index; + u32 *new_segment_indices; u32 rx_fifo_size; u32 tx_fifo_size; u32 preallocated_fifo_pairs; + u32 private_segment_count; + u32 private_segment_size; } svm_fifo_segment_create_args_t; static inline svm_fifo_segment_private_t * diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c index 63b4a9b7..63d75845 100644 --- a/src/svm/test_svm_fifo1.c +++ b/src/svm/test_svm_fifo1.c @@ -39,7 +39,7 @@ hello_world (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -92,7 +92,7 @@ master (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -128,7 +128,7 @@ mempig (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); for (i = 0; i < 1000; i++) { @@ -186,7 +186,7 @@ offset (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 200 << 10, FIFO_SEGMENT_RX_FREELIST); @@ -246,7 +246,7 @@ slave (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_attach returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); sh = sp->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 45ad35a4..a8e39eaa 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -707,7 +707,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) return; } - segment_index = a->new_segment_index; + segment_index = a->new_segment_indices[0]; vec_add2 (utm->seg, seg, 1); memcpy (seg, sm->segments + segment_index, sizeof (*seg)); sleep (1); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index 3cc56f37..8a953719 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -174,6 +174,8 @@ application_init (application_t * app, u32 api_client_index, u64 * options, props->preallocated_fifo_pairs = options[APP_OPTIONS_PREALLOC_FIFO_PAIRS]; props->use_private_segment = options[APP_OPTIONS_FLAGS] & APP_OPTIONS_FLAGS_BUILTIN_APP; + props->private_segment_count = options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT]; + props->private_segment_size = options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE]; first_seg_size = options[SESSION_OPTIONS_SEGMENT_SIZE]; if ((rv = segment_manager_init (sm, props, first_seg_size))) diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 338ae857..566a52d7 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -275,27 +275,6 @@ vnet_application_detach (vnet_app_detach_args_t * a) return 0; } -session_type_t -session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) -{ - if (proto == SESSION_PROTO_TCP) - { - if (is_ip4) - return SESSION_TYPE_IP4_TCP; - else - return SESSION_TYPE_IP6_TCP; - } - else - { - if (is_ip4) - return SESSION_TYPE_IP4_UDP; - else - return SESSION_TYPE_IP6_UDP; - } - - return SESSION_N_TYPES; -} - int vnet_bind_uri (vnet_bind_args_t * a) { diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index 4d6f9def..ed9f89b3 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -22,12 +22,6 @@ #include #include -typedef enum _session_api_proto -{ - SESSION_PROTO_TCP, - SESSION_PROTO_UDP -} session_api_proto_t; - typedef struct _vnet_app_attach_args_t { /** Binary API client index */ @@ -65,7 +59,7 @@ typedef struct _vnet_bind_args_t struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; @@ -98,7 +92,7 @@ typedef struct _vnet_connect_args struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; u32 app_index; @@ -120,6 +114,8 @@ typedef enum APP_EVT_QUEUE_SIZE, APP_OPTIONS_FLAGS, APP_OPTIONS_PREALLOC_FIFO_PAIRS, + APP_OPTIONS_PRIVATE_SEGMENT_COUNT, + APP_OPTIONS_PRIVATE_SEGMENT_SIZE, SESSION_OPTIONS_SEGMENT_SIZE, SESSION_OPTIONS_ADD_SEGMENT_SIZE, SESSION_OPTIONS_RX_FIFO_SIZE, diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b24f5fd9..56e62637 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -378,24 +378,12 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, n_tx_pkts, 0); } -stream_session_t * -session_event_get_session (session_fifo_event_t * e0, u8 thread_index) +always_inline stream_session_t * +session_event_get_session (session_fifo_event_t * e, u8 thread_index) { - svm_fifo_t *f0; - stream_session_t *s0; - u32 session_index0; - - f0 = e0->fifo; - session_index0 = f0->master_session_index; - - /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (f0->master_thread_index == thread_index); - - s0 = stream_session_get_if_valid (session_index0, thread_index); - - ASSERT (s0 == 0 || s0->thread_index == thread_index); - - return s0; + ASSERT (e->fifo->master_thread_index == thread_index); + return stream_session_get_if_valid (e->fifo->master_session_index, + thread_index); } void @@ -569,7 +557,6 @@ skip_dequeue: case FIFO_EVENT_BUILTIN_RX: s0 = session_event_get_session (e0, my_thread_index); svm_fifo_unset_event (s0->server_rx_fifo); - /* Get session's server */ app = application_get (s0->app_index); app->cb_fns.builtin_server_rx_callback (s0); break; diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index dcef6261..262b7faa 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -30,7 +30,7 @@ segment_manager_t *segment_managers = 0; /** * Process private segment index */ -u32 private_segment_index = ~0; +u32 *private_segment_indices; /** * Default fifo and segment size. TODO config. @@ -70,7 +70,8 @@ session_manager_add_segment_i (segment_manager_t * sm, u32 segment_size, return VNET_API_ERROR_SVM_SEGMENT_CREATE_FAIL; } - vec_add1 (sm->segment_indices, ca->new_segment_index); + vec_append (sm->segment_indices, ca->new_segment_indices); + vec_free (ca->new_segment_indices); return 0; } @@ -111,22 +112,23 @@ static void { svm_fifo_segment_create_args_t _a, *a = &_a; - if (private_segment_index != ~0) + if (private_segment_indices) return; memset (a, 0, sizeof (*a)); a->segment_name = "process-private-segment"; a->segment_size = ~0; - a->new_segment_index = ~0; a->rx_fifo_size = props->rx_fifo_size; a->tx_fifo_size = props->tx_fifo_size; a->preallocated_fifo_pairs = props->preallocated_fifo_pairs; + a->private_segment_count = props->private_segment_count; + a->private_segment_size = props->private_segment_size; if (svm_fifo_segment_create_process_private (a)) clib_warning ("Failed to create process private segment"); - private_segment_index = a->new_segment_index; - ASSERT (private_segment_index != ~0); + private_segment_indices = a->new_segment_indices; + ASSERT (vec_len (private_segment_indices)); } /** @@ -156,10 +158,10 @@ segment_manager_init (segment_manager_t * sm, } else { - if (private_segment_index == ~0) + if (vec_len (private_segment_indices) == 0) segment_manager_alloc_process_private_segment (properties); - ASSERT (private_segment_index != ~0); - vec_add1 (sm->segment_indices, private_segment_index); + ASSERT (vec_len (private_segment_indices)); + vec_append (sm->segment_indices, private_segment_indices); } clib_spinlock_init (&sm->lockp); @@ -320,7 +322,7 @@ again: /* See if we're supposed to create another segment */ if (*server_rx_fifo == 0) { - if (sm->properties->add_segment) + if (sm->properties->add_segment && !sm->properties->use_private_segment) { if (added_a_segment) { @@ -379,6 +381,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, svm_fifo_segment_free_fifo (fifo_segment, tx_fifo, FIFO_SEGMENT_TX_FREELIST); + /* Don't try to delete process-private segments */ + if (sm->properties->private_segment_count > 0) + return; + /* Remove segment only if it holds no fifos and not the first */ if (sm->segment_indices[0] != svm_segment_index && !svm_fifo_segment_has_fifos (fifo_segment)) diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h index df38d2b3..41abeb22 100644 --- a/src/vnet/session/segment_manager.h +++ b/src/vnet/session/segment_manager.h @@ -39,6 +39,10 @@ typedef struct _segment_manager_properties /** Use private memory segment instead of shared memory */ u8 use_private_segment; + + /** Use one or more private mheaps, instead of the global heap */ + u32 private_segment_count; + u32 private_segment_size; } segment_manager_properties_t; typedef struct _segment_manager diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index fe198044..0a86d563 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -198,21 +198,28 @@ stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) */ stream_session_t * stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; + stream_session_t *s; int rv; /* Lookup session amongst established ones */ make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); if (rv == 0) - return stream_session_get_tsi (kv4.value, my_thread_index); + return stream_session_get_from_handle (kv4.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener4 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener4 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return stream_session_get_from_handle (kv4.value); + return 0; } stream_session_t * @@ -242,20 +249,27 @@ stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) * wildcarded local source (listener bound to all interfaces) */ stream_session_t * stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = vnet_get_session_manager_main (); session_kv6_t kv6; + stream_session_t *s; int rv; make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); if (rv == 0) - return stream_session_get_tsi (kv6.value, my_thread_index); + return stream_session_get_from_handle (kv6.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener6 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener6 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return stream_session_get_from_handle (kv6.value); + return 0; } stream_session_t * @@ -340,7 +354,6 @@ stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); if (rv == 0) return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); - return 0; } @@ -390,6 +403,8 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, u32 thread_index = tc->thread_index; int rv; + ASSERT (thread_index == vlib_get_thread_index ()); + if ((rv = segment_manager_alloc_session_fifos (sm, &server_rx_fifo, &server_tx_fifo, &fifo_segment_index))) @@ -854,6 +869,7 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, s->app_index = server->index; s->listener_index = listener_index; + s->session_state = SESSION_STATE_ACCEPTING; /* Shoulder-tap the server */ if (notify) @@ -1088,6 +1104,27 @@ session_vpp_event_queue_allocate (session_manager_main_t * smm, } } +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4) +{ + if (proto == TRANSPORT_PROTO_TCP) + { + if (is_ip4) + return SESSION_TYPE_IP4_TCP; + else + return SESSION_TYPE_IP6_TCP; + } + else + { + if (is_ip4) + return SESSION_TYPE_IP4_UDP; + else + return SESSION_TYPE_IP6_UDP; + } + + return SESSION_N_TYPES; +} + static clib_error_t * session_manager_main_enable (vlib_main_t * vm) { @@ -1131,14 +1168,13 @@ session_manager_main_enable (vlib_main_t * vm) session_vpp_event_queue_allocate (smm, i); /* $$$$ preallocate hack config parameter */ - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) { - stream_session_t *ss; + stream_session_t *ss __attribute__ ((unused)); pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); - memset (ss, 0, sizeof (*ss)); } - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) pool_put_index (smm->sessions[0], i); clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", @@ -1208,9 +1244,10 @@ session_manager_main_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (session_manager_main_init) - static clib_error_t *session_config_fn (vlib_main_t * vm, - unformat_input_t * input) +VLIB_INIT_FUNCTION (session_manager_main_init); + +static clib_error_t * +session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t *smm = &session_manager_main; u32 nitems; @@ -1224,6 +1261,9 @@ VLIB_INIT_FUNCTION (session_manager_main_init) else clib_warning ("event queue length %d too small, ignored", nitems); } + if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 5fa4225c..b4507d4e 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -80,6 +80,10 @@ typedef enum SESSION_N_TYPES, } session_type_t; + +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4); + /* * Application session state */ @@ -87,6 +91,7 @@ typedef enum { SESSION_STATE_LISTENING, SESSION_STATE_CONNECTING, + SESSION_STATE_ACCEPTING, SESSION_STATE_READY, SESSION_STATE_CLOSED, SESSION_STATE_N_STATES, @@ -211,8 +216,12 @@ struct _session_manager_main /** Per transport rx function that can either dequeue or peek */ session_fifo_rx_fn *session_tx_fns[SESSION_N_TYPES]; + /** Session manager is enabled */ u8 is_enabled; + /** Preallocate session config parameter */ + u32 preallocated_sessions; + /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -247,13 +256,12 @@ stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); + u16 rmt_port, u8 proto); stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8, u32 thread_index); + u16 rmt_port, u8 proto); transport_connection_t * stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, @@ -277,9 +285,24 @@ stream_session_get_tsi (u64 ti_and_si, u32 thread_index) ti_and_si & 0xFFFFFFFFULL); } +always_inline u8 +stream_session_is_valid (u32 si, u8 thread_index) +{ + stream_session_t *s; + s = pool_elt_at_index (session_manager_main.sessions[thread_index], si); + if (s->thread_index != thread_index || s->session_index != si + || s->server_rx_fifo->master_session_index != si + || s->server_tx_fifo->master_session_index != si + || s->server_rx_fifo->master_thread_index != thread_index + || s->server_tx_fifo->master_thread_index != thread_index) + return 0; + return 1; +} + always_inline stream_session_t * stream_session_get (u32 si, u32 thread_index) { + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } @@ -292,6 +315,7 @@ stream_session_get_if_valid (u64 si, u32 thread_index) if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) return 0; + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 6b8341aa..e06bc586 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -47,7 +47,8 @@ format_stream_session (u8 * s, va_list * args) svm_fifo_max_enqueue (ss->server_tx_fifo), stream_session_get_index (ss)); - if (ss->session_state == SESSION_STATE_READY) + if (ss->session_state == SESSION_STATE_READY + || ss->session_state == SESSION_STATE_ACCEPTING) { s = format (s, "%U", tp_vft->format_connection, ss->connection_index, ss->thread_index, verbose); @@ -68,8 +69,9 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose); + s = + format (s, "[CL] %U", tp_vft->format_connection, ss->connection_index, + ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); if (verbose > 1) @@ -93,7 +95,13 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, int verbose = 0, i; stream_session_t *pool; stream_session_t *s; - u8 *str = 0; + u8 *str = 0, one_session = 0, proto_set = 0, proto = 0; + u8 is_ip4 = 0, s_type = 0; + ip4_address_t lcl_ip4, rmt_ip4; + u32 lcl_port = 0, rmt_port = 0; + + memset (&lcl_ip4, 0, sizeof (lcl_ip4)); + memset (&rmt_ip4, 0, sizeof (rmt_ip4)); if (!smm->is_enabled) { @@ -106,10 +114,43 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "verbose")) verbose = 1; + else if (unformat (input, "tcp")) + { + proto_set = 1; + proto = TRANSPORT_PROTO_TCP; + } + else if (unformat (input, "%U:%d->%U:%d", + unformat_ip4_address, &lcl_ip4, &lcl_port, + unformat_ip4_address, &rmt_ip4, &rmt_port)) + { + one_session = 1; + is_ip4 = 1; + } + else break; } + if (one_session) + { + if (!proto_set) + { + vlib_cli_output (vm, "proto not set"); + return clib_error_return (0, "proto not set"); + } + + s_type = session_type_from_proto_and_ip (proto, is_ip4); + s = stream_session_lookup4 (&lcl_ip4, &rmt_ip4, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), s_type); + if (s) + vlib_cli_output (vm, "%U", format_stream_session, s, 2); + else + vlib_cli_output (vm, "session does not exist"); + + return 0; + } + for (i = 0; i < vec_len (smm->sessions); i++) { u32 once_per_pool; @@ -146,6 +187,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } else vlib_cli_output (vm, "Thread %d: no active sessions", i); + vec_reset_length (str); } vec_free (str); @@ -161,15 +203,22 @@ VLIB_CLI_COMMAND (show_session_command, static) = }; /* *INDENT-ON* */ +static int +clear_session (stream_session_t * s) +{ + application_t *server = application_get (s->app_index); + server->cb_fns.session_disconnect_callback (s); + return 0; +} + static clib_error_t * clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { session_manager_main_t *smm = &session_manager_main; - u32 thread_index = 0; + u32 thread_index = 0, clear_all = 0; u32 session_index = ~0; - stream_session_t *pool, *session; - application_t *server; + stream_session_t **pool, *session; if (!smm->is_enabled) { @@ -182,28 +231,36 @@ clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "session %d", &session_index)) ; + else if (unformat (input, "all")) + clear_all = 1; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } - if (session_index == ~0) + if (!clear_all && session_index == ~0) return clib_error_return (0, "session required, but not set."); - if (thread_index > vec_len (smm->sessions)) - return clib_error_return (0, "thread %d out of range [0-%d]", - thread_index, vec_len (smm->sessions)); - - pool = smm->sessions[thread_index]; - - if (pool_is_free_index (pool, session_index)) - return clib_error_return (0, "session %d not active", session_index); - - session = pool_elt_at_index (pool, session_index); - server = application_get (session->app_index); + if (session_index != ~0) + { + session = stream_session_get_if_valid (session_index, thread_index); + if (!session) + return clib_error_return (0, "no session %d on thread %d", + session_index, thread_index); + clear_session (session); + } - /* Disconnect both app and transport */ - server->cb_fns.session_disconnect_callback (session); + if (clear_all) + { + /* *INDENT-OFF* */ + vec_foreach (pool, smm->sessions) + { + pool_foreach(session, *pool, ({ + clear_session (session); + })); + }; + /* *INDENT-ON* */ + } return 0; } diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 561a9257..9c38bab9 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -225,6 +225,12 @@ make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) t->rmt_port, t->proto); } +typedef enum _transport_proto +{ + TRANSPORT_PROTO_TCP, + TRANSPORT_PROTO_UDP +} transport_proto_t; + typedef struct _transport_endpoint { ip46_address_t ip; /** ip address */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 6f8be082..a6c8a235 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -170,62 +170,90 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { tclient_main_t *tm = &tclient_main; int my_thread_index = vlib_get_thread_index (); - vl_api_disconnect_session_t *dmp; session_t *sp; int i; int delete_session; u32 *connection_indices; - u32 tx_quota = 0; - u32 delta, prev_bytes_received_this_session; + u32 *connections_this_batch; + u32 nconnections_this_batch; connection_indices = tm->connection_index_by_thread[my_thread_index]; + connections_this_batch = + tm->connections_this_batch_by_thread[my_thread_index]; - if (tm->run_test == 0 || vec_len (connection_indices) == 0) + if ((tm->run_test == 0) || + ((vec_len (connection_indices) == 0) + && vec_len (connections_this_batch) == 0)) return 0; - for (i = 0; i < vec_len (connection_indices); i++) + /* Grab another pile of connections */ + if (PREDICT_FALSE (vec_len (connections_this_batch) == 0)) + { + nconnections_this_batch = + clib_min (tm->connections_per_batch, vec_len (connection_indices)); + + ASSERT (nconnections_this_batch > 0); + vec_validate (connections_this_batch, nconnections_this_batch - 1); + clib_memcpy (connections_this_batch, + connection_indices + vec_len (connection_indices) + - nconnections_this_batch, + nconnections_this_batch * sizeof (u32)); + _vec_len (connection_indices) -= nconnections_this_batch; + } + + if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch + && tm->prev_conns == vec_len (connections_this_batch))) + { + tm->repeats++; + tm->prev_conns = vec_len (connections_this_batch); + if (tm->repeats == 500000) + { + clib_warning ("stuck clients"); + } + } + else + { + tm->prev_conns = vec_len (connections_this_batch); + tm->repeats = 0; + } + + for (i = 0; i < vec_len (connections_this_batch); i++) { delete_session = 1; - sp = pool_elt_at_index (tm->sessions, connection_indices[i]); + sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]); - if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) + if (sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; - tx_quota++; } - if (!tm->no_return && sp->bytes_to_receive > 0) + if (sp->bytes_to_receive > 0) { - prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); - delta = sp->bytes_received - prev_bytes_received_this_session; - if (delta > 0) - tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) { - __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); + u32 index, thread_index; + stream_session_t *s; + + __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); - dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = tm->my_client_index; - dmp->handle = sp->vpp_session_handle; - if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, - 1)) + stream_session_parse_handle (sp->vpp_session_handle, + &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + + if (s) { - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; + stream_session_disconnect (s); + vec_delete (connections_this_batch, 1, i); + i--; __sync_fetch_and_add (&tm->ready_connections, -1); } else - { - vl_msg_api_free (dmp); - } + clib_warning ("session AWOL?"); /* Kick the debug CLI process */ if (tm->ready_connections == 0) @@ -236,6 +264,10 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } } } + + tm->connection_index_by_thread[my_thread_index] = connection_indices; + tm->connections_this_batch_by_thread[my_thread_index] = + connections_this_batch; return 0; } @@ -356,6 +388,8 @@ tcp_test_clients_init (vlib_main_t * vm) tm->vlib_main = vm; vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains); + vec_validate (tm->connections_this_batch_by_thread, + thread_main->n_vlib_mains); return 0; } @@ -388,7 +422,8 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, pool_get (tm->sessions, session); memset (session, 0, sizeof (*session)); session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send; session->server_rx_fifo = s->server_rx_fifo; session->server_rx_fifo->client_session_index = session_index; session->server_tx_fifo = s->server_tx_fifo; @@ -485,6 +520,8 @@ attach_builtin_test_clients_app (void) options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count; + options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size; options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; @@ -561,6 +598,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->bytes_to_send = 8192; tm->no_return = 0; tm->fifo_size = 64 << 10; + tm->connections_per_batch = 1000; + tm->private_segment_count = 0; + tm->private_segment_size = 0; vec_free (tm->connect_uri); @@ -582,6 +622,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->no_return = 1; else if (unformat (input, "fifo-size %d", &tm->fifo_size)) tm->fifo_size <<= 10; + else if (unformat (input, "private-segment-count %d", + &tm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + tm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + tm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + tm->private_segment_size = tmp; + else if (unformat (input, "preallocate-fifos")) + tm->prealloc_fifos = 1; + else + if (unformat (input, "client-batch %d", &tm->connections_per_batch)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); @@ -688,9 +742,13 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "zero delta-t?"); cleanup: - pool_free (tm->sessions); + tm->run_test = 0; for (i = 0; i < vec_len (tm->connection_index_by_thread); i++) - vec_reset_length (tm->connection_index_by_thread[i]); + { + vec_reset_length (tm->connection_index_by_thread[i]); + vec_reset_length (tm->connections_this_batch_by_thread[i]); + } + pool_free (tm->sessions); return 0; } diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 3462e0ee..38af231d 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -63,6 +63,9 @@ typedef struct u32 configured_segment_size; u32 fifo_size; u32 expected_connections; /**< Number of clients/connections */ + u32 connections_per_batch; /**< Connections to rx/tx at once */ + u32 private_segment_count; /**< Number of private fifo segs */ + u32 private_segment_size; /**< size of private fifo segs */ /* * Test state variables @@ -72,6 +75,7 @@ typedef struct uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + u32 **connections_this_batch_by_thread; /**< active connection batch */ pthread_t client_thread_handle; volatile u32 ready_connections; @@ -82,7 +86,8 @@ typedef struct f64 test_start_time; f64 test_end_time; - + u32 prev_conns; + u32 repeats; /* * Flags */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 775bfc26..8e958ac0 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -56,12 +56,15 @@ typedef struct u32 fifo_size; /**< Fifo size */ u32 rcv_buffer_size; /**< Rcv buffer size */ u32 prealloc_fifos; /**< Preallocate fifos */ + u32 private_segment_count; /**< Number of private segments */ + u32 private_segment_size; /**< Size of private segments */ /* * Test state */ u8 **rx_buf; /**< Per-thread RX buffer */ u64 byte_index; + u32 **rx_retries; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -77,6 +80,8 @@ builtin_session_accept_callback (stream_session_t * s) session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; bsm->byte_index = 0; + vec_validate (bsm->rx_retries[s->thread_index], s->session_index); + bsm->rx_retries[s->thread_index][s->session_index] = 0; return 0; } @@ -173,11 +178,16 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - u32 my_thread_id = vlib_get_thread_index (); + u32 thread_index = vlib_get_thread_index (); + + ASSERT (s->thread_index == thread_index); rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; + ASSERT (rx_fifo->master_thread_index == thread_index); + ASSERT (tx_fifo->master_thread_index == thread_index); + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -201,21 +211,31 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - q = bsm->vpp_queue[s->thread_index]; + q = bsm->vpp_queue[thread_index]; if (PREDICT_FALSE (q->cursize == q->maxsize)) clib_warning ("out of event queue space"); - else - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* don't wait for mutex */ ); + else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */ + )) + clib_warning ("failed to enqueue self-tap"); + + bsm->rx_retries[thread_index][s->session_index]++; + if (bsm->rx_retries[thread_index][s->session_index] == 500000) + { + clib_warning ("session stuck: %U", format_stream_session, s, 2); + } + } + else + { + bsm->rx_retries[thread_index][s->session_index] = 0; } return 0; } - _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; + _vec_len (bsm->rx_buf[thread_index]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -225,7 +245,7 @@ builtin_server_rx_callback (stream_session_t * s) */ n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); if (n_written != max_transfer) clib_warning ("short trout!"); @@ -237,11 +257,13 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("failed to enqueue tx evt"); } - if (PREDICT_FALSE (max_enqueue < max_dequeue)) + if (PREDICT_FALSE (n_written < max_dequeue)) goto rx_event; return 0; @@ -328,9 +350,13 @@ server_attach () a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; - a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size; a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = bsm->prealloc_fifos ? bsm->prealloc_fifos : 1; + + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -374,6 +400,8 @@ server_create (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); vec_validate (bsm->rx_buf, num_threads - 1); + vec_validate (bsm->rx_retries, num_threads - 1); + for (i = 0; i < num_threads; i++) vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); @@ -435,11 +463,14 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, { builtin_server_main_t *bsm = &builtin_server_main; int rv; + u32 tmp; bsm->no_echo = 0; bsm->fifo_size = 64 << 10; bsm->rcv_buffer_size = 128 << 10; bsm->prealloc_fifos = 0; + bsm->private_segment_count = 0; + bsm->private_segment_size = 0; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -449,8 +480,17 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, bsm->fifo_size <<= 10; else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) ; - else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) + else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bsm->private_segment_count)) ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + bsm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + bsm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + bsm->private_segment_size = tmp; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 4e85eb3f..f379e699 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -74,8 +74,16 @@ static void tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); - TCP_EVT_DBG (TCP_EVT_UNBIND, - pool_elt_at_index (tm->listener_pool, listener_index)); + tcp_connection_t *tc; + + tc = pool_elt_at_index (tm->listener_pool, listener_index); + + TCP_EVT_DBG (TCP_EVT_UNBIND, tc); + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put_index (tm->listener_pool, listener_index); } @@ -124,9 +132,20 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Check if half-open */ if (tc->state == TCP_STATE_SYN_SENT) - pool_put (tm->half_open_connections, tc); + { + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->half_open_connections, tc); + } else - pool_put (tm->connections[tc->c_thread_index], tc); + { + int thread_index = tc->c_thread_index; + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->connections[thread_index], tc); + } } /** @@ -168,13 +187,14 @@ tcp_connection_reset (tcp_connection_t * tc) /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); - stream_session_reset_notify (&tc->connection); + + /* Wait for cleanup from session layer but not forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: return; } - } /** @@ -278,6 +298,9 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) tries = max - min; time_now = tcp_time_now (); + /* Only support active opens from thread 0 */ + ASSERT (vlib_get_thread_index () == 0); + /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); @@ -343,6 +366,7 @@ tcp_connection_timers_reset (tcp_connection_t * tc) } } +#if 0 typedef struct ip4_tcp_hdr { ip4_header_t ip; @@ -435,6 +459,7 @@ tcp_connection_fib_attach (tcp_connection_t * tc) tcp_connection_stack_on_fib_entry (tc); } +#endif /* 0 */ /** Initialize tcp connection variables * @@ -447,7 +472,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); - tcp_connection_fib_attach (tc); + // tcp_connection_fib_attach (tc); } int @@ -485,14 +510,38 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) if (is_ip4) { ip4_address_t *ip4; - ip4 = ip_interface_get_first_ip (sw_if_index, 1); - lcl_addr.ip4.as_u32 = ip4->as_u32; + int index; + if (vec_len (tm->ip4_src_addresses)) + { + index = tm->last_v4_address_rotor++; + if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses)) + tm->last_v4_address_rotor = 0; + lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32; + } + else + { + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } } else { ip6_address_t *ip6; - ip6 = ip_interface_get_first_ip (sw_if_index, 0); - clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + int index; + + if (vec_len (tm->ip6_src_addresses)) + { + index = tm->last_v6_address_rotor++; + if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses)) + tm->last_v6_address_rotor = 0; + clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index], + sizeof (*ip6)); + } + else + { + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } } /* Allocate source port */ @@ -614,7 +663,7 @@ u8 * format_tcp_vars (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n", + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, tc->snd_una_max - tc->iss); s = format (s, " rcv_nxt %u rcv_las %u\n", @@ -628,12 +677,17 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u", tc->prev_ssthresh, tc->snd_congestion - tc->iss, tc->rcv_dupacks); + s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss); + s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr, + tc->tsecr_last_ack); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); + s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + tcp_time_now () - tc->tsval_recent_age); s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -719,11 +773,21 @@ format_tcp_sacks (u8 * s, va_list * args) tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); sack_block_t *sacks = tc->snd_sacks; sack_block_t *block; - vec_foreach (block, sacks) - { - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } return s; } @@ -796,14 +860,18 @@ tcp_session_send_mss (transport_connection_t * trans_conn) always_inline u32 tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) { - if (tc->snd_wnd < tc->snd_mss) + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) { return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; } /* If we can't write at least a segment, don't try at all */ - if (snd_space < tc->snd_mss) - return 0; + if (PREDICT_FALSE (snd_space < tc->snd_mss)) + { + if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) + return snd_space; + return 0; + } /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1042,6 +1110,8 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; + int thread, i; + tcp_connection_t *tc __attribute__ ((unused)); if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1074,6 +1144,27 @@ tcp_main_enable (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (tm->connections, num_threads - 1); + /* + * Preallocate connections + */ + for (thread = 0; thread < num_threads; thread++) + { + for (i = 0; i < tm->preallocated_connections; i++) + pool_get (tm->connections[thread], tc); + + for (i = 0; i < tm->preallocated_connections; i++) + pool_put_index (tm->connections[thread], i); + } + + /* + * Preallocate half-open connections + */ + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_get (tm->half_open_connections, tc); + + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_put_index (tm->half_open_connections, i); + /* Initialize per worker thread tx buffers (used for control messages) */ vec_validate (tm->tx_buffers, num_threads - 1); @@ -1116,7 +1207,6 @@ tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - tm->vlib_main = vm; tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; @@ -1125,6 +1215,97 @@ tcp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (tcp_init); + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "preallocated-connections %d", + &tm->preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tm->preallocated_half_open_connections)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + +static clib_error_t * +tcp_src_address (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + int v4set = 0; + int v6set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip4_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v4start)); + v6set = 1; + } + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at least one v4 or v6 address required"); + + if (v4set) + { + u32 tmp; + + do + { + vec_add1 (tm->ip4_src_addresses, v4start); + tmp = clib_net_to_host_u32 (v4start.as_u32); + tmp++; + v4start.as_u32 = clib_host_to_net_u32 (tmp); + } + while (clib_host_to_net_u32 (v4start.as_u32) <= + clib_host_to_net_u32 (v4end.as_u32)); + } + if (v6set) + { + clib_warning ("v6 src address list unimplemented..."); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address [- ] add src address range", + .function = tcp_src_address, +}; +/* *INDENT-ON* */ + + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 12d804b8..37b10fd4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -348,6 +348,16 @@ typedef struct _tcp_main /* Flag that indicates if stack is on or off */ u8 is_enabled; + /** Number of preallocated connections */ + u32 preallocated_connections; + u32 preallocated_half_open_connections; + + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ + ip4_address_t *ip4_src_addresses; + u32 last_v4_address_rotor; + u32 last_v6_address_rotor; + ip6_address_t *ip6_src_addresses; + /* convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -569,6 +579,7 @@ tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -577,6 +588,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) return; @@ -588,6 +600,7 @@ tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) always_inline void tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->timers[timer_id]); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ae68ad1b..be51bca2 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -383,9 +383,16 @@ typedef enum _tcp_dbg_evt "establish", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = _timer_id; \ - ed->data[1] = _timer_id; \ + if (_tc) \ + { \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _timer_id; \ + ed->data[1] = _timer_id; \ + } \ + else \ + { \ + clib_warning ("pop for unexisting connection %d", _tc_index); \ + } \ } #define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a2e6dad1..45db0da6 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -251,6 +251,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { + ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } @@ -383,12 +384,9 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) if (tc->srtt != 0) { err = mrtt - tc->srtt; -// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ -// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; - tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); diff = (clib_abs (err) - (int) tc->rttvar) >> 2; tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); @@ -491,6 +489,14 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } +static u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + /** * Checks if ack is a congestion control event. */ @@ -503,7 +509,7 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, *is_dack = tc->sack_sb.last_sacked_bytes || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); - return (*is_dack || tcp_in_cong_recovery (tc)); + return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); } void @@ -750,10 +756,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) * last hole end */ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->high_sacked) - && seq_gt (tc->snd_una_max, last_hole->end)) - last_hole->end = tc->snd_una_max; - /* keep track of max byte sacked for when the last hole + if (seq_gt (tc->snd_una_max, last_hole->end)) + { + if (seq_geq (last_hole->start, sb->high_sacked)) + { + last_hole->end = tc->snd_una_max; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_una_max)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_una_max); + } + } + /* Keep track of max byte sacked for when the last hole * is acked */ if (seq_gt (tmp.end, sb->high_sacked)) sb->high_sacked = tmp.end; @@ -764,7 +780,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { blk = &tc->rcv_opts.sacks[blk_index]; - if (seq_leq (blk->start, hole->start)) { /* Block covers hole. Remove hole */ @@ -784,6 +799,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } else if (!next_hole) { + ASSERT (seq_geq (sb->high_sacked, ack)); sb->snd_una_adv = sb->high_sacked - ack; sb->last_bytes_delivered += sb->high_sacked - hole->end; } @@ -819,7 +835,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { hole->end = blk->start; } - hole = scoreboard_next_hole (sb, hole); } } @@ -827,10 +842,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->holes[sb->head].start == ack + sb->snd_una_adv); } /** @@ -916,7 +934,8 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tc->snd_rxt_ts + return (tcp_in_recovery (tc) + && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } @@ -994,6 +1013,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { ASSERT (tc->snd_una != tc->snd_una_max || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) @@ -1012,17 +1032,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) goto partial_ack_test; } - /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold, - * and the following doesn't hold: the congestion window is - * greater than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes (XXX) - * 2) RFC6582 heuristic to avoid multiple fast retransmits + /* If of of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp */ - if ((seq_gt (tc->snd_una, tc->snd_congestion) - || !(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) { tc->rcv_dupacks = 0; return; @@ -1038,6 +1061,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) * three segments that have left the network and should've been * buffered at the receiver XXX */ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; + ASSERT (tc->cwnd >= tc->snd_mss); /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts) @@ -1112,7 +1136,7 @@ partial_ack: >= tc->sack_sb.last_bytes_delivered); rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - tc->sack_sb.last_bytes_delivered; - if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ @@ -1301,6 +1325,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, { int written; + ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Update rcv_nxt and be done. */ if (PREDICT_FALSE (data_len == 0)) { @@ -1450,6 +1476,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; + vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; vlib_buffer_advance (b, n_bytes_to_drop); goto in_order; @@ -1912,11 +1939,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); /* Make sure after data segment processing ACK is sent */ new_tc0->flags |= TCP_CONN_SNDACK; + + /* Update rtt with the syn-ack sample */ + new_tc0->bytes_acked = 1; + tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); } /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ else @@ -1932,9 +1960,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); + tc0->rtt_ts = 0; + tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2151,8 +2178,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; - - /* Shoulder tap the server */ stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ @@ -2175,6 +2200,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ if (tc0->snd_una == tc0->snd_una_max) { + ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); @@ -2545,10 +2571,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); - /* Init fifo pointers after we have iss */ - stream_session_init_fifos_pointers (&child0->connection, - child0->irs + 1, - child0->iss + 1); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -2886,9 +2908,12 @@ do { \ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. */ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -2905,12 +2930,14 @@ do { \ _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN in reply to our FIN from the other side */ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN confirming that the peer (app) has closed */ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -2929,6 +2956,8 @@ do { \ TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c825e952..103fea4c 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -63,8 +63,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) * window deflation" attempts to ensure that, when fast recovery * eventually ends, approximately ssthresh amount of data will be * outstanding in the network.*/ - tc->cwnd = (tc->cwnd > tc->bytes_acked) ? - tc->cwnd - tc->bytes_acked : 0; + tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ? + tc->cwnd - tc->bytes_acked : tc->snd_mss; if (tc->bytes_acked > tc->snd_mss) tc->cwnd += tc->snd_mss; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 41bebcb3..b418e8ba 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,17 +19,20 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, TCP_OUTPUT_N_NEXT } tcp_output_next_t; #define foreach_tcp4_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") #define foreach_tcp6_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") static char *tcp_error_strings[] = { #define tcp_error(n,s) s, @@ -427,16 +430,16 @@ tcp_init_mss (tcp_connection_t * tc) #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + vlib_get_main(), my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -445,12 +448,12 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ } while (0) always_inline void @@ -757,23 +760,22 @@ void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -851,6 +853,13 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + /* Enqueue the packet */ f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); @@ -1144,6 +1153,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; } else { @@ -1232,7 +1242,7 @@ tcp_timer_persist_handler (u32 index) /* Nothing to send */ if (n_bytes <= 0) { - clib_warning ("persist found nothing to send"); + // clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } @@ -1448,7 +1458,7 @@ tcp46_output_inline (vlib_main_t * vm, tcp_connection_t *tc0; tcp_tx_trace_t *t0; tcp_header_t *th0 = 0; - u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; to_next[0] = bi0; @@ -1527,6 +1537,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } +#if 0 /* Make sure we haven't lost route to our peer */ if (PREDICT_FALSE (tc0->last_fib_check < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) @@ -1547,6 +1558,10 @@ tcp46_output_inline (vlib_main_t * vm, /* Use pre-computed dpo to set next node */ next0 = tc0->c_rmt_dpo.dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; done: diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index a6f62ee1..9ccfe655 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -168,6 +168,7 @@ typedef struct #define TCP_OPTION_LEN_TIMESTAMP 10 #define TCP_OPTION_LEN_SACK_BLOCK 8 +#define TCP_HDR_LEN_MAX 60 #define TCP_WND_MAX 65535U #define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ #define TCP_OPTS_ALIGN 4 diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index a461e3b8..510deb4f 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -290,7 +290,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) { tcp_connection_t _tc, *tc = &_tc; sack_block_t *sacks; - int i, verbose = 0; + int i, verbose = 0, expected; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -326,8 +326,12 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) sacks = vec_dup (tc->snd_sacks); tcp_update_sack_list (tc, 1100, 1200); - TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", - vec_len (tc->snd_sacks), 5); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5; + TCP_TEST ((vec_len (tc->snd_sacks) == expected), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected); TCP_TEST ((tc->snd_sacks[0].start == 1100), "first sack block start %u expected %u", tc->snd_sacks[0].start, 1100); diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index e6b4f8fc..9a8ff076 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -123,7 +123,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* lookup session */ s0 = stream_session_lookup4 (&ip0->dst_address, &ip0->src_address, udp0->dst_port, udp0->src_port, - SESSION_TYPE_IP4_UDP, my_thread_index); + SESSION_TYPE_IP4_UDP); /* no listener */ if (PREDICT_FALSE (s0 == 0)) -- cgit 1.2.3-korg