diff options
-rw-r--r-- | src/plugins/unittest/tcp_test.c | 1 | ||||
-rw-r--r-- | src/vnet/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.c | 1053 | ||||
-rw-r--r-- | src/vnet/tcp/tcp.h | 963 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_api.c | 5 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_bt.c | 2 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_bt.h | 91 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_cc.h | 111 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_cli.c | 1030 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_cubic.c | 1 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_inlines.h | 457 | ||||
-rwxr-xr-x | src/vnet/tcp/tcp_input.c | 864 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_newreno.c | 1 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_output.c | 89 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_packet.h | 213 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_sack.c | 607 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_sack.h | 126 | ||||
-rw-r--r-- | src/vnet/tcp/tcp_types.h | 451 |
18 files changed, 3128 insertions, 2946 deletions
diff --git a/src/plugins/unittest/tcp_test.c b/src/plugins/unittest/tcp_test.c index a485823ffc7..535f0552b10 100644 --- a/src/plugins/unittest/tcp_test.c +++ b/src/plugins/unittest/tcp_test.c @@ -13,6 +13,7 @@ * limitations under the License. */ #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> #define TCP_TEST_I(_cond, _comment, _args...) \ ({ \ diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt index 1574c3daa70..21780e2beb4 100644 --- a/src/vnet/CMakeLists.txt +++ b/src/vnet/CMakeLists.txt @@ -659,9 +659,11 @@ list(APPEND VNET_SOURCES tcp/tcp_output.c tcp/tcp_input.c tcp/tcp_newreno.c - tcp/tcp_cubic.c tcp/tcp_bt.c + tcp/tcp_cli.c + tcp/tcp_cubic.c tcp/tcp_debug.c + tcp/tcp_sack.c tcp/tcp.c ) @@ -674,7 +676,12 @@ list(APPEND VNET_MULTIARCH_SOURCES list(APPEND VNET_HEADERS tcp/tcp_packet.h tcp/tcp_timer.h + tcp/tcp_bt.h + tcp/tcp_cc.h tcp/tcp_debug.h + tcp/tcp_inlines.h + tcp/tcp_sack.h + tcp/tcp_types.h tcp/tcp.h tcp/tcp_error.def ) diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index a1d774dc89b..2ac938a1b1e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -19,11 +19,10 @@ */ #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> #include <vnet/session/session.h> #include <vnet/fib/fib.h> #include <vnet/dpo/load_balance.h> -#include <vnet/dpo/receive_dpo.h> -#include <vnet/ip-neighbor/ip_neighbor.h> #include <math.h> tcp_main_t tcp_main; @@ -707,51 +706,6 @@ tcp_connection_init_vars (tcp_connection_t * tc) tc->start_ts = tcp_time_now_us (tc->c_thread_index); } -void -tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4) -{ - tcp_header_t *th = tcp_buffer_hdr (b); - - tc->c_lcl_port = th->dst_port; - tc->c_rmt_port = th->src_port; - tc->c_is_ip4 = is_ip4; - - if (is_ip4) - { - ip4_header_t *ip4 = vlib_buffer_get_current (b); - tc->c_lcl_ip4.as_u32 = ip4->dst_address.as_u32; - tc->c_rmt_ip4.as_u32 = ip4->src_address.as_u32; - } - else - { - ip6_header_t *ip6 = vlib_buffer_get_current (b); - 
clib_memcpy_fast (&tc->c_lcl_ip6, &ip6->dst_address, - sizeof (ip6_address_t)); - clib_memcpy_fast (&tc->c_rmt_ip6, &ip6->src_address, - sizeof (ip6_address_t)); - } - - tc->irs = vnet_buffer (b)->tcp.seq_number; - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_number + 1; - tc->rcv_las = tc->rcv_nxt; - tc->sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX]; - tc->snd_wl1 = vnet_buffer (b)->tcp.seq_number; - tc->snd_wl2 = vnet_buffer (b)->tcp.ack_number; - - /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} - * segments are used to initialize PAWS. */ - if (tcp_opts_tstamp (&tc->rcv_opts)) - { - tc->tsval_recent = tc->rcv_opts.tsval; - tc->tsval_recent_age = tcp_time_now (); - } - - if (tcp_opts_wscale (&tc->rcv_opts)) - tc->snd_wscale = tc->rcv_opts.wscale; - - tc->snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; -} - static int tcp_alloc_custom_local_endpoint (tcp_main_t * tm, ip46_address_t * lcl_addr, u16 * lcl_port, u8 is_ip4) @@ -831,256 +785,6 @@ tcp_session_open (transport_endpoint_cfg_t * rmt) return tc->c_c_index; } -const char *tcp_fsm_states[] = { -#define _(sym, str) str, - foreach_tcp_fsm_state -#undef _ -}; - -u8 * -format_tcp_state (u8 * s, va_list * args) -{ - u32 state = va_arg (*args, u32); - - if (state < TCP_N_STATES) - s = format (s, "%s", tcp_fsm_states[state]); - else - s = format (s, "UNKNOWN (%d (0x%x))", state, state); - return s; -} - -const char *tcp_cfg_flags_str[] = { -#define _(sym, str) str, - foreach_tcp_cfg_flag -#undef _ -}; - -static u8 * -format_tcp_cfg_flags (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = -1; - - for (i = 0; i < TCP_CFG_N_FLAG_BITS; i++) - if (tc->cfg_flags & (1 << i)) - last = i; - for (i = 0; i < last; i++) - { - if (tc->cfg_flags & (1 << i)) - s = format (s, "%s, ", tcp_cfg_flags_str[i]); - } - if (last >= 0) - s = format (s, "%s", tcp_cfg_flags_str[last]); - return s; -} - -const char *tcp_connection_flags_str[] = { -#define _(sym, str) 
str, - foreach_tcp_connection_flag -#undef _ -}; - -static u8 * -format_tcp_connection_flags (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = -1; - - for (i = 0; i < TCP_CONN_N_FLAG_BITS; i++) - if (tc->flags & (1 << i)) - last = i; - for (i = 0; i < last; i++) - { - if (tc->flags & (1 << i)) - s = format (s, "%s, ", tcp_connection_flags_str[i]); - } - if (last >= 0) - s = format (s, "%s", tcp_connection_flags_str[last]); - return s; -} - -const char *tcp_conn_timers[] = { -#define _(sym, str) str, - foreach_tcp_timer -#undef _ -}; - -static u8 * -format_tcp_timers (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = -1; - - for (i = 0; i < TCP_N_TIMERS; i++) - if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) - last = i; - - for (i = 0; i < last; i++) - { - if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) - s = format (s, "%s,", tcp_conn_timers[i]); - } - - if (last >= 0) - s = format (s, "%s", tcp_conn_timers[i]); - - return s; -} - -static u8 * -format_tcp_congestion_status (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - if (tcp_in_recovery (tc)) - s = format (s, "recovery"); - else if (tcp_in_fastrecovery (tc)) - s = format (s, "fastrecovery"); - else - s = format (s, "none"); - return s; -} - -static i32 -tcp_rcv_wnd_available (tcp_connection_t * tc) -{ - return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); -} - -static u8 * -format_tcp_congestion (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - u32 indent = format_get_indent (s), prr_space = 0; - - s = format (s, "%U ", format_tcp_congestion_status, tc); - s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n", - tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked); - s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u\n", - format_white_space, indent, tcp_available_cc_snd_space (tc), - 
tc->prev_cwnd, tc->prev_ssthresh); - s = format (s, "%Usnd_cong %u dupack %u limited_tx %u\n", - format_white_space, indent, tc->snd_congestion - tc->iss, - tc->rcv_dupacks, tc->limited_transmit - tc->iss); - s = format (s, "%Urxt_bytes %u rxt_delivered %u rxt_head %u rxt_ts %u\n", - format_white_space, indent, tc->snd_rxt_bytes, - tc->rxt_delivered, tc->rxt_head - tc->iss, - tcp_time_now_w_thread (tc->c_thread_index) - tc->snd_rxt_ts); - if (tcp_in_fastrecovery (tc)) - prr_space = tcp_fastrecovery_prr_snd_space (tc); - s = format (s, "%Uprr_start %u prr_delivered %u prr space %u\n", - format_white_space, indent, tc->prr_start - tc->iss, - tc->prr_delivered, prr_space); - return s; -} - -static u8 * -format_tcp_stats (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - u32 indent = format_get_indent (s); - s = format (s, "in segs %lu dsegs %lu bytes %lu dupacks %u\n", - tc->segs_in, tc->data_segs_in, tc->bytes_in, tc->dupacks_in); - s = format (s, "%Uout segs %lu dsegs %lu bytes %lu dupacks %u\n", - format_white_space, indent, tc->segs_out, - tc->data_segs_out, tc->bytes_out, tc->dupacks_out); - s = format (s, "%Ufr %u tr %u rxt segs %lu bytes %lu duration %.3f\n", - format_white_space, indent, tc->fr_occurences, - tc->tr_occurences, tc->segs_retrans, tc->bytes_retrans, - tcp_time_now_us (tc->c_thread_index) - tc->start_ts); - s = format (s, "%Uerr wnd data below %u above %u ack below %u above %u", - format_white_space, indent, tc->errors.below_data_wnd, - tc->errors.above_data_wnd, tc->errors.below_ack_wnd, - tc->errors.above_ack_wnd); - return s; -} - -static u8 * -format_tcp_vars (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, " index: %u cfg: %U flags: %U timers: %U\n", tc->c_c_index, - format_tcp_cfg_flags, tc, format_tcp_connection_flags, tc, - format_tcp_timers, tc); - s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", - tc->snd_una - tc->iss, tc->snd_nxt - 
tc->iss, - tc->snd_una_max - tc->iss); - s = format (s, " rcv_nxt %u rcv_las %u\n", - tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs); - s = format (s, " snd_wnd %u rcv_wnd %u rcv_wscale %u ", - tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale); - s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs, - tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u out space %u rcv_wnd_av %u", - tcp_flight_size (tc), tcp_available_output_snd_space (tc), - tcp_rcv_wnd_available (tc)); - s = format (s, " tsval_recent %u\n", tc->tsval_recent); - s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u", - tc->rcv_opts.tsecr, tc->tsecr_last_ack, - tcp_time_now () - tc->tsval_recent_age); - s = format (s, " snd_mss %u\n", tc->snd_mss); - s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f", - tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar, - tc->rtt_ts); - s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss); - s = format (s, " next_node %u opaque 0x%x fib_index %u\n", - tc->next_node_index, tc->next_node_opaque, tc->c_fib_index); - s = format (s, " cong: %U", format_tcp_congestion, tc); - - if (tc->state >= TCP_STATE_ESTABLISHED) - { - s = format (s, " sboard: %U\n", format_tcp_scoreboard, &tc->sack_sb, - tc); - s = format (s, " stats: %U\n", format_tcp_stats, tc); - } - if (vec_len (tc->snd_sacks)) - s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); - - return s; -} - -u8 * -format_tcp_connection_id (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - if (!tc) - return s; - if (tc->c_is_ip4) - { - s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index, - tc->c_s_index, "T", format_ip4_address, &tc->c_lcl_ip4, - clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address, - &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port)); - } - else - { - s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index, - tc->c_s_index, "T", format_ip6_address, &tc->c_lcl_ip6, - 
clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address, - &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port)); - } - - return s; -} - -u8 * -format_tcp_connection (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - u32 verbose = va_arg (*args, u32); - - if (!tc) - return s; - s = format (s, "%-50U", format_tcp_connection_id, tc); - if (verbose) - { - s = format (s, "%-15U", format_tcp_state, tc->state); - if (verbose > 1) - s = format (s, "\n%U", format_tcp_vars, tc); - } - - return s; -} - static u8 * format_tcp_session (u8 * s, va_list * args) { @@ -1119,100 +823,6 @@ format_tcp_half_open_session (u8 * s, va_list * args) return format (s, "%U", format_tcp_connection_id, tc); } -u8 * -format_tcp_sacks (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - sack_block_t *sacks = tc->snd_sacks; - sack_block_t *block; - int i, len = 0; - - len = vec_len (sacks); - for (i = 0; i < len - 1; i++) - { - block = &sacks[i]; - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } - if (len) - { - block = &sacks[len - 1]; - s = format (s, " start %u end %u", block->start - tc->irs, - block->end - tc->irs); - } - return s; -} - -u8 * -format_tcp_rcv_sacks (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - sack_block_t *sacks = tc->rcv_opts.sacks; - sack_block_t *block; - int i, len = 0; - - len = vec_len (sacks); - for (i = 0; i < len - 1; i++) - { - block = &sacks[i]; - s = format (s, " start %u end %u\n", block->start - tc->iss, - block->end - tc->iss); - } - if (len) - { - block = &sacks[len - 1]; - s = format (s, " start %u end %u", block->start - tc->iss, - block->end - tc->iss); - } - return s; -} - -static u8 * -format_tcp_sack_hole (u8 * s, va_list * args) -{ - sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - if (tc) - s = format 
(s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss); - else - s = format (s, " [%u, %u]", hole->start, hole->end); - return s; -} - -u8 * -format_tcp_scoreboard (u8 * s, va_list * args) -{ - sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - sack_scoreboard_hole_t *hole; - u32 indent = format_get_indent (s); - - s = format (s, "sacked %u last_sacked %u lost %u last_lost %u" - " rxt_sacked %u\n", - sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes, - sb->last_lost_bytes, sb->rxt_sacked); - s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n", - format_white_space, indent, sb->last_bytes_delivered, - sb->high_sacked - tc->iss, sb->is_reneging); - s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u", - format_white_space, indent, sb->cur_rxt_hole, - sb->high_rxt - tc->iss, sb->rescue_rxt - tc->iss); - - hole = scoreboard_first_hole (sb); - if (hole) - s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space, - indent, sb->head, sb->tail, pool_elts (sb->holes), - format_white_space, indent); - - while (hole) - { - s = format (s, "%U", format_tcp_sack_hole, hole, tc); - hole = scoreboard_next_hole (sb, hole); - } - - return s; -} - static transport_connection_t * tcp_session_get_transport (u32 conn_index, u32 thread_index) { @@ -1813,7 +1423,6 @@ tcp_init (vlib_main_t * vm) transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto, FIB_PROTOCOL_IP6, tcp6_output_node.index); - tcp_api_reference (); tcp_configuration_init (); tm->cc_algo_by_name = hash_create_string (0, sizeof (uword)); @@ -1823,666 +1432,6 @@ tcp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (tcp_init); -uword -unformat_tcp_cc_algo (unformat_input_t * input, va_list * va) -{ - tcp_cc_algorithm_type_e *result = va_arg (*va, tcp_cc_algorithm_type_e *); - tcp_main_t *tm = &tcp_main; - char *cc_algo_name; - u8 found = 0; - uword *p; - - if (unformat (input, "%s", &cc_algo_name) - && 
((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name)))) - { - *result = *p; - found = 1; - } - - vec_free (cc_algo_name); - return found; -} - -uword -unformat_tcp_cc_algo_cfg (unformat_input_t * input, va_list * va) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tcp_cc_algorithm_t *cc_alg; - unformat_input_t sub_input; - int found = 0; - - vec_foreach (cc_alg, tm->cc_algos) - { - if (!unformat (input, cc_alg->name)) - continue; - - if (cc_alg->unformat_cfg - && unformat (input, "%U", unformat_vlib_cli_sub_input, &sub_input)) - { - if (cc_alg->unformat_cfg (&sub_input)) - found = 1; - } - } - return found; -} - -static clib_error_t * -tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) -{ - u32 cwnd_multiplier, tmp_time; - uword memory_size; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "preallocated-connections %d", - &tcp_cfg.preallocated_connections)) - ; - else if (unformat (input, "preallocated-half-open-connections %d", - &tcp_cfg.preallocated_half_open_connections)) - ; - else if (unformat (input, "buffer-fail-fraction %f", - &tcp_cfg.buffer_fail_fraction)) - ; - else if (unformat (input, "max-rx-fifo %U", unformat_memory_size, - &memory_size)) - { - if (memory_size >= 0x100000000) - { - return clib_error_return - (0, "max-rx-fifo %llu (0x%llx) too large", memory_size, - memory_size); - } - tcp_cfg.max_rx_fifo = memory_size; - } - else if (unformat (input, "min-rx-fifo %U", unformat_memory_size, - &memory_size)) - { - if (memory_size >= 0x100000000) - { - return clib_error_return - (0, "min-rx-fifo %llu (0x%llx) too large", memory_size, - memory_size); - } - tcp_cfg.min_rx_fifo = memory_size; - } - else if (unformat (input, "mtu %u", &tcp_cfg.default_mtu)) - ; - else if (unformat (input, "rwnd-min-update-ack %d", - &tcp_cfg.rwnd_min_update_ack)) - ; - else if (unformat (input, "initial-cwnd-multiplier %u", - &cwnd_multiplier)) - tcp_cfg.initial_cwnd_multiplier = cwnd_multiplier; - else if (unformat 
(input, "no-tx-pacing")) - tcp_cfg.enable_tx_pacing = 0; - else if (unformat (input, "tso")) - tcp_cfg.allow_tso = 1; - else if (unformat (input, "no-csum-offload")) - tcp_cfg.csum_offload = 0; - else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo, - &tcp_cfg.cc_algo)) - ; - else if (unformat (input, "%U", unformat_tcp_cc_algo_cfg)) - ; - else if (unformat (input, "closewait-time %u", &tmp_time)) - tcp_cfg.closewait_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "timewait-time %u", &tmp_time)) - tcp_cfg.timewait_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "finwait1-time %u", &tmp_time)) - tcp_cfg.finwait1_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "finwait2-time %u", &tmp_time)) - tcp_cfg.finwait2_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "lastack-time %u", &tmp_time)) - tcp_cfg.lastack_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "closing-time %u", &tmp_time)) - tcp_cfg.closing_time = tmp_time / TCP_TIMER_TICK; - else if (unformat (input, "cleanup-time %u", &tmp_time)) - tcp_cfg.cleanup_time = tmp_time / 1000.0; - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - return 0; -} - -VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); - - -/** - * \brief Configure an ipv4 source address range - * @param vm vlib_main_t pointer - * @param start first ipv4 address in the source address range - * @param end last ipv4 address in the source address range - * @param table_id VRF / table ID, 0 for the default FIB - * @return 0 if all OK, else an error indication from api_errno.h - */ - -int -tcp_configure_v4_source_address_range (vlib_main_t * vm, - ip4_address_t * start, - ip4_address_t * end, u32 table_id) -{ - u32 start_host_byte_order, end_host_byte_order; - fib_prefix_t prefix; - fib_node_index_t fei; - u32 fib_index = 0; - u32 sw_if_index; - int rv; - - clib_memset (&prefix, 0, sizeof (prefix)); - - fib_index = fib_table_find 
(FIB_PROTOCOL_IP4, table_id); - - if (fib_index == ~0) - return VNET_API_ERROR_NO_SUCH_FIB; - - start_host_byte_order = clib_net_to_host_u32 (start->as_u32); - end_host_byte_order = clib_net_to_host_u32 (end->as_u32); - - /* sanity check for reversed args or some such */ - if ((end_host_byte_order - start_host_byte_order) > (10 << 10)) - return VNET_API_ERROR_INVALID_ARGUMENT; - - /* Lookup the last address, to identify the interface involved */ - prefix.fp_len = 32; - prefix.fp_proto = FIB_PROTOCOL_IP4; - memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t)); - - fei = fib_table_lookup (fib_index, &prefix); - - /* Couldn't find route to destination. Bail out. */ - if (fei == FIB_NODE_INDEX_INVALID) - return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; - - sw_if_index = fib_entry_get_resolving_interface (fei); - - /* Configure proxy arp across the range */ - rv = ip4_neighbor_proxy_add (fib_index, start, end); - - if (rv) - return rv; - - rv = ip4_neighbor_proxy_enable (sw_if_index); - - if (rv) - return rv; - - do - { - dpo_id_t dpo = DPO_INVALID; - - vec_add1 (tcp_cfg.ip4_src_addrs, start[0]); - - /* Add local adjacencies for the range */ - - receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ , - NULL, &dpo); - prefix.fp_len = 32; - prefix.fp_proto = FIB_PROTOCOL_IP4; - prefix.fp_addr.ip4.as_u32 = start->as_u32; - - fib_table_entry_special_dpo_update (fib_index, - &prefix, - FIB_SOURCE_API, - FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); - dpo_reset (&dpo); - - start_host_byte_order++; - start->as_u32 = clib_host_to_net_u32 (start_host_byte_order); - } - while (start_host_byte_order <= end_host_byte_order); - - return 0; -} - -/** - * \brief Configure an ipv6 source address range - * @param vm vlib_main_t pointer - * @param start first ipv6 address in the source address range - * @param end last ipv6 address in the source address range - * @param table_id VRF / table ID, 0 for the default FIB - * @return 0 if all OK, else an error indication from api_errno.h - */ - -int 
-tcp_configure_v6_source_address_range (vlib_main_t * vm, - ip6_address_t * start, - ip6_address_t * end, u32 table_id) -{ - fib_prefix_t prefix; - u32 fib_index = 0; - fib_node_index_t fei; - u32 sw_if_index; - - clib_memset (&prefix, 0, sizeof (prefix)); - - fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id); - - if (fib_index == ~0) - return VNET_API_ERROR_NO_SUCH_FIB; - - while (1) - { - int i; - ip6_address_t tmp; - dpo_id_t dpo = DPO_INVALID; - - /* Remember this address */ - vec_add1 (tcp_cfg.ip6_src_addrs, start[0]); - - /* Lookup the prefix, to identify the interface involved */ - prefix.fp_len = 128; - prefix.fp_proto = FIB_PROTOCOL_IP6; - memcpy (&prefix.fp_addr.ip6, start, sizeof (ip6_address_t)); - - fei = fib_table_lookup (fib_index, &prefix); - - /* Couldn't find route to destination. Bail out. */ - if (fei == FIB_NODE_INDEX_INVALID) - return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; - - sw_if_index = fib_entry_get_resolving_interface (fei); - - if (sw_if_index == (u32) ~ 0) - return VNET_API_ERROR_NO_MATCHING_INTERFACE; - - /* Add a proxy neighbor discovery entry for this address */ - ip6_neighbor_proxy_add (sw_if_index, start); - - /* Add a receive adjacency for this address */ - receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ , - NULL, &dpo); - - fib_table_entry_special_dpo_update (fib_index, - &prefix, - FIB_SOURCE_API, - FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); - dpo_reset (&dpo); - - /* Done with the entire range? */ - if (!memcmp (start, end, sizeof (start[0]))) - break; - - /* Increment the address. DGMS. 
*/ - tmp = start[0]; - for (i = 15; i >= 0; i--) - { - tmp.as_u8[i] += 1; - if (tmp.as_u8[i] != 0) - break; - } - start[0] = tmp; - } - return 0; -} - -static clib_error_t * -tcp_src_address_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd_arg) -{ - ip4_address_t v4start, v4end; - ip6_address_t v6start, v6end; - u32 table_id = 0; - int v4set = 0; - int v6set = 0; - int rv; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, - unformat_ip4_address, &v4end)) - v4set = 1; - else if (unformat (input, "%U", unformat_ip4_address, &v4start)) - { - memcpy (&v4end, &v4start, sizeof (v4start)); - v4set = 1; - } - else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, - unformat_ip6_address, &v6end)) - v6set = 1; - else if (unformat (input, "%U", unformat_ip6_address, &v6start)) - { - memcpy (&v6end, &v6start, sizeof (v6start)); - v6set = 1; - } - else if (unformat (input, "fib-table %d", &table_id)) - ; - else - break; - } - - if (!v4set && !v6set) - return clib_error_return (0, "at least one v4 or v6 address required"); - - if (v4set) - { - rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end, - table_id); - switch (rv) - { - case 0: - break; - - case VNET_API_ERROR_NO_SUCH_FIB: - return clib_error_return (0, "Invalid table-id %d", table_id); - - case VNET_API_ERROR_INVALID_ARGUMENT: - return clib_error_return (0, "Invalid address range %U - %U", - format_ip4_address, &v4start, - format_ip4_address, &v4end); - default: - return clib_error_return (0, "error %d", rv); - break; - } - } - if (v6set) - { - rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end, - table_id); - switch (rv) - { - case 0: - break; - - case VNET_API_ERROR_NO_SUCH_FIB: - return clib_error_return (0, "Invalid table-id %d", table_id); - - default: - return clib_error_return (0, "error %d", rv); - break; - } - } - return 0; -} - -/* *INDENT-OFF* */ 
-VLIB_CLI_COMMAND (tcp_src_address_command, static) = -{ - .path = "tcp src-address", - .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range", - .function = tcp_src_address_fn, -}; -/* *INDENT-ON* */ - -static u8 * -tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb) -{ -#if TCP_SCOREBOARD_TRACE - - scoreboard_trace_elt_t *block; - int i = 0; - - if (!sb->trace) - return s; - - s = format (s, "scoreboard trace:"); - vec_foreach (block, sb->trace) - { - s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end, - block->ack, block->snd_una_max, block->group); - if ((++i % 3) == 0) - s = format (s, "\n"); - } - return s; -#else - return 0; -#endif -} - -static clib_error_t * -tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd_arg) -{ - transport_connection_t *tconn = 0; - tcp_connection_t *tc; - u8 *s = 0; - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "%U", unformat_transport_connection, &tconn, - TRANSPORT_PROTO_TCP)) - ; - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - - if (!TCP_SCOREBOARD_TRACE) - { - vlib_cli_output (vm, "scoreboard tracing not enabled"); - return 0; - } - - tc = tcp_get_connection_from_transport (tconn); - s = tcp_scoreboard_dump_trace (s, &tc->sack_sb); - vlib_cli_output (vm, "%v", s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) = -{ - .path = "show tcp scoreboard trace", - .short_help = "show tcp scoreboard trace <connection>", - .function = tcp_show_scoreboard_trace_fn, -}; -/* *INDENT-ON* */ - -u8 * -tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose) -{ - int i, trace_len; - scoreboard_trace_elt_t *trace; - u32 next_ack, left, group, has_new_ack = 0; - tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc; - sack_block_t *block; - - if (!TCP_SCOREBOARD_TRACE) - { - s = format (s, "scoreboard 
tracing not enabled"); - return s; - } - - if (!tc) - return s; - - clib_memset (dummy_tc, 0, sizeof (*dummy_tc)); - tcp_connection_timers_init (dummy_tc); - scoreboard_init (&dummy_tc->sack_sb); - dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; - -#if TCP_SCOREBOARD_TRACE - trace = tc->sack_sb.trace; - trace_len = vec_len (tc->sack_sb.trace); -#endif - - for (i = 0; i < trace_len; i++) - { - if (trace[i].ack != 0) - { - dummy_tc->snd_una = trace[i].ack - 1448; - dummy_tc->snd_una_max = trace[i].ack; - } - } - - left = 0; - while (left < trace_len) - { - group = trace[left].group; - vec_reset_length (dummy_tc->rcv_opts.sacks); - has_new_ack = 0; - while (trace[left].group == group) - { - if (trace[left].ack != 0) - { - if (verbose) - s = format (s, "Adding ack %u, snd_una_max %u, segs: ", - trace[left].ack, trace[left].snd_una_max); - dummy_tc->snd_una_max = trace[left].snd_una_max; - next_ack = trace[left].ack; - has_new_ack = 1; - } - else - { - if (verbose) - s = format (s, "[%u, %u], ", trace[left].start, - trace[left].end); - vec_add2 (dummy_tc->rcv_opts.sacks, block, 1); - block->start = trace[left].start; - block->end = trace[left].end; - } - left++; - } - - /* Push segments */ - tcp_rcv_sacks (dummy_tc, next_ack); - if (has_new_ack) - dummy_tc->snd_una = next_ack; - - if (verbose) - s = format (s, "result: %U", format_tcp_scoreboard, - &dummy_tc->sack_sb); - - } - s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb); - - return s; -} - -static clib_error_t * -tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd_arg) -{ - transport_connection_t *tconn = 0; - tcp_connection_t *tc = 0; - u8 *str = 0; - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "%U", unformat_transport_connection, &tconn, - TRANSPORT_PROTO_TCP)) - ; - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - - if (!TCP_SCOREBOARD_TRACE) - { - 
vlib_cli_output (vm, "scoreboard tracing not enabled"); - return 0; - } - - tc = tcp_get_connection_from_transport (tconn); - if (!tc) - { - vlib_cli_output (vm, "connection not found"); - return 0; - } - str = tcp_scoreboard_replay (str, tc, 1); - vlib_cli_output (vm, "%v", str); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) = -{ - .path = "tcp replay scoreboard", - .short_help = "tcp replay scoreboard <connection>", - .function = tcp_scoreboard_trace_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd_arg) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - return clib_error_return (0, "unknown input `%U'", format_unformat_error, - input); - vlib_cli_output (vm, "IPv4 TCP punt: %s", - tm->punt_unknown4 ? "enabled" : "disabled"); - vlib_cli_output (vm, "IPv6 TCP punt: %s", - tm->punt_unknown6 ? "enabled" : "disabled"); - return 0; -} -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_tcp_punt_command, static) = -{ - .path = "show tcp punt", - .short_help = "show tcp punt", - .function = show_tcp_punt_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -show_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tcp_worker_ctx_t *wrk; - u32 thread; - - if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - return clib_error_return (0, "unknown input `%U'", format_unformat_error, - input); - for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++) - { - wrk = tcp_get_worker (thread); - vlib_cli_output (vm, "Thread %u:\n", thread); - - if (clib_fifo_elts (wrk->pending_timers)) - vlib_cli_output (vm, " %lu pending timers", - clib_fifo_elts (wrk->pending_timers)); - -#define _(name,type,str) \ - if (wrk->stats.name) \ - vlib_cli_output (vm, " %lu %s", wrk->stats.name, str); - foreach_tcp_wrk_stat 
-#undef _ - } - - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_tcp_stats_command, static) = -{ - .path = "show tcp stats", - .short_help = "show tcp stats", - .function = show_tcp_stats_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -clear_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input, - vlib_cli_command_t * cmd) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tcp_worker_ctx_t *wrk; - u32 thread; - - if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - return clib_error_return (0, "unknown input `%U'", format_unformat_error, - input); - - for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++) - { - wrk = tcp_get_worker (thread); - clib_memset (&wrk->stats, 0, sizeof (wrk->stats)); - } - - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (clear_tcp_stats_command, static) = -{ - .path = "clear tcp stats", - .short_help = "clear tcp stats", - .function = clear_tcp_stats_fn, -}; -/* *INDENT-ON* */ - /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 30c95a48ffa..708d7566eb2 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -18,464 +18,13 @@ #include <vnet/vnet.h> #include <vnet/ip/ip.h> -#include <vnet/tcp/tcp_packet.h> -#include <vnet/tcp/tcp_timer.h> -#include <vnet/session/transport.h> #include <vnet/session/session.h> +#include <vnet/tcp/tcp_types.h> +#include <vnet/tcp/tcp_timer.h> #include <vnet/tcp/tcp_debug.h> - -#define TCP_TICK 0.001 /**< TCP tick period (s) */ -#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ -#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ -#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ -#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */ -#define TCP_MAX_OPTION_SPACE 40 -#define TCP_CC_DATA_SZ 24 -#define TCP_MAX_GSO_SZ 65536 -#define TCP_RXT_MAX_BURST 10 - -#define TCP_DUPACK_THRESHOLD 3 -#define TCP_IW_N_SEGMENTS 10 -#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ 
-#define TCP_USE_SACKS 1 /**< Disable only for testing */ - -/** TCP FSM state definitions as per RFC793. */ -#define foreach_tcp_fsm_state \ - _(CLOSED, "CLOSED") \ - _(LISTEN, "LISTEN") \ - _(SYN_SENT, "SYN_SENT") \ - _(SYN_RCVD, "SYN_RCVD") \ - _(ESTABLISHED, "ESTABLISHED") \ - _(CLOSE_WAIT, "CLOSE_WAIT") \ - _(FIN_WAIT_1, "FIN_WAIT_1") \ - _(LAST_ACK, "LAST_ACK") \ - _(CLOSING, "CLOSING") \ - _(FIN_WAIT_2, "FIN_WAIT_2") \ - _(TIME_WAIT, "TIME_WAIT") - -typedef enum _tcp_state -{ -#define _(sym, str) TCP_STATE_##sym, - foreach_tcp_fsm_state -#undef _ - TCP_N_STATES -} tcp_state_t; - -format_function_t format_tcp_state; -format_function_t format_tcp_flags; -format_function_t format_tcp_sacks; -format_function_t format_tcp_rcv_sacks; - -/** TCP timers */ -#define foreach_tcp_timer \ - _(RETRANSMIT, "RETRANSMIT") \ - _(DELACK, "DELAYED ACK") \ - _(PERSIST, "PERSIST") \ - _(WAITCLOSE, "WAIT CLOSE") \ - _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ - -typedef enum _tcp_timers -{ -#define _(sym, str) TCP_TIMER_##sym, - foreach_tcp_timer -#undef _ - TCP_N_TIMERS -} tcp_timers_e; - -#define TCP_TIMER_HANDLE_INVALID ((u32) ~0) - -#define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */ -#define TCP_TO_TIMER_TICK TCP_TICK*10 /**< Factor for converting - ticks to timer ticks */ - -#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ -#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */ -#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ -#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ -#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ -#define TCP_RTO_BOFF_MAX 8 /* Max number of retries before reset */ -#define TCP_ESTABLISH_TIME (60 * THZ) /* Connection establish timeout */ - -/** Connection configuration flags */ -#define foreach_tcp_cfg_flag \ - _(RATE_SAMPLE, "Rate sampling") \ - _(NO_CSUM_OFFLOAD, "No csum offload") \ - _(NO_TSO, "TSO off") \ - _(TSO, "TSO") \ - _(NO_ENDPOINT,"No endpoint") \ - 
-typedef enum tcp_cfg_flag_bits_ -{ -#define _(sym, str) TCP_CFG_F_##sym##_BIT, - foreach_tcp_cfg_flag -#undef _ - TCP_CFG_N_FLAG_BITS -} tcp_cfg_flag_bits_e; - -typedef enum tcp_cfg_flag_ -{ -#define _(sym, str) TCP_CFG_F_##sym = 1 << TCP_CFG_F_##sym##_BIT, - foreach_tcp_cfg_flag -#undef _ - TCP_CFG_N_FLAGS -} tcp_cfg_flags_e; - -/** TCP connection flags */ -#define foreach_tcp_connection_flag \ - _(SNDACK, "Send ACK") \ - _(FINSNT, "FIN sent") \ - _(RECOVERY, "Recovery") \ - _(FAST_RECOVERY, "Fast Recovery") \ - _(DCNT_PENDING, "Disconnect pending") \ - _(HALF_OPEN_DONE, "Half-open completed") \ - _(FINPNDG, "FIN pending") \ - _(RXT_PENDING, "Retransmit pending") \ - _(FRXT_FIRST, "Retransmit first") \ - _(DEQ_PENDING, "Dequeue pending ") \ - _(PSH_PENDING, "PSH pending") \ - _(FINRCVD, "FIN received") \ - _(ZERO_RWND_SENT, "Zero RWND sent") \ - -typedef enum tcp_connection_flag_bits_ -{ -#define _(sym, str) TCP_CONN_##sym##_BIT, - foreach_tcp_connection_flag -#undef _ - TCP_CONN_N_FLAG_BITS -} tcp_connection_flag_bits_e; - -typedef enum tcp_connection_flag_ -{ -#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, - foreach_tcp_connection_flag -#undef _ - TCP_CONN_N_FLAGS -} tcp_connection_flags_e; - -#define TCP_SCOREBOARD_TRACE (0) -#define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */ -#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) - -typedef struct _scoreboard_trace_elt -{ - u32 start; - u32 end; - u32 ack; - u32 snd_una_max; - u32 group; -} scoreboard_trace_elt_t; - -typedef struct _sack_scoreboard_hole -{ - u32 next; /**< Index for next entry in linked list */ - u32 prev; /**< Index for previous entry in linked list */ - u32 start; /**< Start sequence number */ - u32 end; /**< End sequence number */ - u8 is_lost; /**< Mark hole as lost */ -} sack_scoreboard_hole_t; - -typedef struct _sack_scoreboard -{ - sack_scoreboard_hole_t *holes; /**< Pool of holes */ - u32 head; /**< Index of first entry */ - u32 tail; /**< Index of last 
entry */ - u32 sacked_bytes; /**< Number of bytes sacked in sb */ - u32 last_sacked_bytes; /**< Number of bytes last sacked */ - u32 last_bytes_delivered; /**< Sack bytes delivered to app */ - u32 rxt_sacked; /**< Rxt bytes last delivered */ - u32 high_sacked; /**< Highest byte sacked (fack) */ - u32 high_rxt; /**< Highest retransmitted sequence */ - u32 rescue_rxt; /**< Rescue sequence number */ - u32 lost_bytes; /**< Bytes lost as per RFC6675 */ - u32 last_lost_bytes; /**< Number of bytes last lost */ - u32 cur_rxt_hole; /**< Retransmitting from this hole */ - u8 is_reneging; - -#if TCP_SCOREBOARD_TRACE - scoreboard_trace_elt_t *trace; -#endif - -} sack_scoreboard_t; - -#if TCP_SCOREBOARD_TRACE -#define tcp_scoreboard_trace_add(_tc, _ack) \ -{ \ - static u64 _group = 0; \ - sack_scoreboard_t *_sb = &_tc->sack_sb; \ - sack_block_t *_sack, *_sacks; \ - scoreboard_trace_elt_t *_elt; \ - int i; \ - _group++; \ - _sacks = _tc->rcv_opts.sacks; \ - for (i = 0; i < vec_len (_sacks); i++) \ - { \ - _sack = &_sacks[i]; \ - vec_add2 (_sb->trace, _elt, 1); \ - _elt->start = _sack->start; \ - _elt->end = _sack->end; \ - _elt->ack = _elt->end == _ack ? _ack : 0; \ - _elt->snd_una_max = _elt->end == _ack ? 
_tc->snd_una_max : 0; \ - _elt->group = _group; \ - } \ -} -#else -#define tcp_scoreboard_trace_add(_tc, _ack) -#endif - -sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, - sack_scoreboard_hole_t * - start, u8 have_sent_1_smss, - u8 * can_rescue, - u8 * snd_limited); -sack_scoreboard_hole_t *scoreboard_get_hole (sack_scoreboard_t * sb, - u32 index); - -sack_scoreboard_hole_t *scoreboard_next_hole (sack_scoreboard_t * sb, - sack_scoreboard_hole_t * hole); -sack_scoreboard_hole_t *scoreboard_prev_hole (sack_scoreboard_t * sb, - sack_scoreboard_hole_t * hole); -sack_scoreboard_hole_t *scoreboard_first_hole (sack_scoreboard_t * sb); -sack_scoreboard_hole_t *scoreboard_last_hole (sack_scoreboard_t * sb); - -void scoreboard_clear (sack_scoreboard_t * sb); -void scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end); -void scoreboard_init (sack_scoreboard_t * sb); -void scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una); -u8 *format_tcp_scoreboard (u8 * s, va_list * args); - -#define TCP_BTS_INVALID_INDEX ((u32)~0) - -typedef enum tcp_bts_flags_ -{ - TCP_BTS_IS_RXT = 1, - TCP_BTS_IS_APP_LIMITED = 1 << 1, - TCP_BTS_IS_SACKED = 1 << 2, - TCP_BTS_IS_RXT_LOST = 1 << 3, -} __clib_packed tcp_bts_flags_t; - -typedef struct tcp_bt_sample_ -{ - u32 next; /**< Next sample index in list */ - u32 prev; /**< Previous sample index in list */ - u32 min_seq; /**< Min seq number in sample */ - u32 max_seq; /**< Max seq number. 
Set for rxt samples */ - u64 delivered; /**< Total delivered bytes for sample */ - f64 delivered_time; /**< Delivered time when sample taken */ - f64 tx_time; /**< Transmit time for the burst */ - f64 first_tx_time; /**< Connection first tx time at tx */ - u64 tx_in_flight; /**< In flight at tx time */ - u64 tx_lost; /**< Lost at tx time */ - tcp_bts_flags_t flags; /**< Sample flag */ -} tcp_bt_sample_t; - -typedef struct tcp_rate_sample_ -{ - u64 prior_delivered; /**< Delivered of sample used for rate, i.e., - total bytes delivered at prior_time */ - f64 prior_time; /**< Delivered time of sample used for rate */ - f64 interval_time; /**< Time to ack the bytes delivered */ - f64 rtt_time; /**< RTT for sample */ - u64 tx_in_flight; /**< In flight at (re)transmit time */ - u64 tx_lost; /**< Lost over interval */ - u32 delivered; /**< Bytes delivered in interval_time */ - u32 acked_and_sacked; /**< Bytes acked + sacked now */ - u32 last_lost; /**< Bytes lost now */ - u32 lost; /**< Number of bytes lost over interval */ - tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */ -} tcp_rate_sample_t; - -typedef struct tcp_byte_tracker_ -{ - tcp_bt_sample_t *samples; /**< Pool of samples */ - rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */ - u32 head; /**< Head of samples linked list */ - u32 tail; /**< Tail of samples linked list */ - u32 last_ooo; /**< Cached last ooo sample */ -} tcp_byte_tracker_t; - -typedef enum _tcp_cc_algorithm_type -{ - TCP_CC_NEWRENO, - TCP_CC_CUBIC, - TCP_CC_LAST = TCP_CC_CUBIC -} tcp_cc_algorithm_type_e; - -typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t; - -typedef enum _tcp_cc_ack_t -{ - TCP_CC_ACK, - TCP_CC_DUPACK, - TCP_CC_PARTIALACK -} tcp_cc_ack_t; - -typedef enum tcp_cc_event_ -{ - TCP_CC_EVT_START_TX, -} tcp_cc_event_t; - -/* - * As per RFC4898 tcpEStatsStackSoftErrors - */ -typedef struct tcp_errors_ -{ - u32 below_data_wnd; /**< All data in seg is below snd_una */ - u32 above_data_wnd; /**< Some data 
in segment is above snd_wnd */ - u32 below_ack_wnd; /**< Acks for data below snd_una */ - u32 above_ack_wnd; /**< Acks for data not sent */ -} tcp_errors_t; - -typedef struct _tcp_connection -{ - CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); - transport_connection_t connection; /**< Common transport data. First! */ - - u8 state; /**< TCP state as per tcp_state_t */ - u8 cfg_flags; /**< Connection configuration flags */ - u16 flags; /**< Connection flags (see tcp_conn_flags_e) */ - u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */ - - u64 segs_in; /** RFC4022/4898 tcpHCInSegs/tcpEStatsPerfSegsIn */ - u64 bytes_in; /** RFC4898 tcpEStatsPerfHCDataOctetsIn */ - u64 segs_out; /** RFC4898 tcpEStatsPerfSegsOut */ - u64 bytes_out; /** RFC4898 tcpEStatsPerfHCDataOctetsOut */ - - /** Send sequence variables RFC793 */ - u32 snd_una; /**< oldest unacknowledged sequence number */ - u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/ - u32 snd_wnd; /**< send window */ - u32 snd_wl1; /**< seq number used for last snd.wnd update */ - u32 snd_wl2; /**< ack number used for last snd.wnd update */ - u32 snd_nxt; /**< next seq number to be sent */ - u16 snd_mss; /**< Effective send max seg (data) size */ - - u64 data_segs_in; /** RFC4898 tcpEStatsPerfDataSegsIn */ - u64 data_segs_out; /** RFC4898 tcpEStatsPerfDataSegsOut */ - - /** Receive sequence variables RFC793 */ - u32 rcv_nxt; /**< next sequence number expected */ - u32 rcv_wnd; /**< receive window we expect */ - - u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */ - u32 iss; /**< initial sent sequence */ - u32 irs; /**< initial remote sequence */ - - /* Options */ - u8 snd_opts_len; /**< Tx options len */ - u8 rcv_wscale; /**< Window scale to advertise to peer */ - u8 snd_wscale; /**< Window scale to use when sending */ - u32 tsval_recent; /**< Last timestamp received */ - u32 tsval_recent_age; /**< When last updated tstamp_recent*/ - tcp_options_t snd_opts; /**< Tx options for connection */ - 
tcp_options_t rcv_opts; /**< Rx options for connection */ - - sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */ - u8 snd_sack_pos; /**< Position in vec of first block to send */ - sack_block_t *snd_sacks_fl; /**< Vector for building new list */ - sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ - - u16 rcv_dupacks; /**< Number of recent DUPACKs received */ - u32 dupacks_in; /**< RFC4898 tcpEStatsStackDupAcksIn*/ - u8 pending_dupacks; /**< Number of DUPACKs to be sent */ - u32 dupacks_out; /**< RFC4898 tcpEStatsPathDupAcksOut */ - - /* Congestion control */ - u32 cwnd; /**< Congestion window */ - u32 cwnd_acc_bytes; /**< Bytes accumulated for cwnd increment */ - u32 ssthresh; /**< Slow-start threshold */ - u32 prev_ssthresh; /**< ssthresh before congestion */ - u32 prev_cwnd; /**< ssthresh before congestion */ - u32 bytes_acked; /**< Bytes acknowledged by current segment */ - u32 burst_acked; /**< Bytes acknowledged in current burst */ - u32 snd_rxt_bytes; /**< Retransmitted bytes during current cc event */ - u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ - u32 prr_delivered; /**< RFC6937 bytes delivered during current event */ - u32 prr_start; /**< snd_una when prr starts */ - u32 rxt_delivered; /**< Rxt bytes delivered during current cc event */ - u32 rxt_head; /**< snd_una last time we re rxted the head */ - u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ - u32 snd_congestion; /**< snd_una_max when congestion is detected */ - u32 tx_fifo_size; /**< Tx fifo size. 
Used to constrain cwnd */ - tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ - u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */ - - u32 fr_occurences; /**< fast-retransmit occurrences RFC4898 - tcpEStatsStackFastRetran */ - u32 tr_occurences; /**< timer-retransmit occurrences */ - u64 bytes_retrans; /**< RFC4898 tcpEStatsPerfOctetsRetrans */ - u64 segs_retrans; /**< RFC4898 tcpEStatsPerfSegsRetrans*/ - - /* RTT and RTO */ - u32 rto; /**< Retransmission timeout */ - u32 rto_boff; /**< Index for RTO backoff */ - u32 srtt; /**< Smoothed RTT */ - u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */ - u32 rtt_seq; /**< Sequence number for tracked ACK */ - f64 rtt_ts; /**< Timestamp for tracked ACK */ - f64 mrtt_us; /**< High precision mrtt from tracked acks */ - - u32 psh_seq; /**< Add psh header for seg that includes this */ - u32 next_node_index; /**< Can be used to control next node in output */ - u32 next_node_opaque; /**< Opaque to pass to next node */ - u32 limited_transmit; /**< snd_nxt when limited transmit starts */ - u32 sw_if_index; /**< Interface for the connection */ - - /* Delivery rate estimation */ - u64 delivered; /**< Total bytes delivered to peer */ - u64 app_limited; /**< Delivered when app-limited detected */ - f64 delivered_time; /**< Time last bytes were acked */ - f64 first_tx_time; /**< Send time for recently delivered/sent */ - u64 lost; /**< Total bytes lost */ - tcp_byte_tracker_t *bt; /**< Tx byte tracker */ - - tcp_errors_t errors; /**< Soft connection errors */ - - f64 start_ts; /**< Timestamp when connection initialized */ - u32 last_fib_check; /**< Last time we checked fib route for peer */ - u16 mss; /**< Our max seg size that includes options */ - u32 timestamp_delta; /**< Offset for timestamp */ - u32 ipv6_flow_label; /**< flow label for ipv6 header */ - -#define rst_state snd_wl1 -} tcp_connection_t; - -/* *INDENT-OFF* */ -struct _tcp_cc_algorithm -{ - const char *name; - 
uword (*unformat_cfg) (unformat_input_t * input); - void (*init) (tcp_connection_t * tc); - void (*cleanup) (tcp_connection_t * tc); - void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs); - void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack, - tcp_rate_sample_t *rs); - void (*congestion) (tcp_connection_t * tc); - void (*loss) (tcp_connection_t * tc); - void (*recovered) (tcp_connection_t * tc); - void (*undo_recovery) (tcp_connection_t * tc); - void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt); - u64 (*get_pacing_rate) (tcp_connection_t *tc); -}; -/* *INDENT-ON* */ - -#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY -#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY -#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY -#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY -#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) -#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) -#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) -#define tcp_disconnect_pending(tc) ((tc)->flags & TCP_CONN_DCNT_PENDING) -#define tcp_disconnect_pending_on(tc) ((tc)->flags |= TCP_CONN_DCNT_PENDING) -#define tcp_disconnect_pending_off(tc) ((tc)->flags &= ~TCP_CONN_DCNT_PENDING) -#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST) -#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST) -#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST) - -#define tcp_in_cong_recovery(tc) ((tc)->flags & \ - (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) - -#define tcp_csum_offload(tc) (!((tc)->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD)) +#include <vnet/tcp/tcp_sack.h> +#include <vnet/tcp/tcp_bt.h> +#include <vnet/tcp/tcp_cc.h> typedef void (timer_expiration_handler) (tcp_connection_t * tc); @@ -484,17 +33,6 @@ extern timer_expiration_handler tcp_timer_retransmit_handler; extern timer_expiration_handler 
tcp_timer_persist_handler; extern timer_expiration_handler tcp_timer_retransmit_syn_handler; -always_inline void -tcp_cong_recovery_off (tcp_connection_t * tc) -{ - tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); - tcp_fastrecovery_first_off (tc); -} - -#define tcp_zero_rwnd_sent(tc) ((tc)->flags & TCP_CONN_ZERO_RWND_SENT) -#define tcp_zero_rwnd_sent_on(tc) (tc)->flags |= TCP_CONN_ZERO_RWND_SENT -#define tcp_zero_rwnd_sent_off(tc) (tc)->flags &= ~TCP_CONN_ZERO_RWND_SENT - typedef enum _tcp_error { #define tcp_error(n,s) TCP_ERROR_##n, @@ -752,14 +290,6 @@ tcp_get_worker (u32 thread_index) return &tcp_main.wrk_ctx[thread_index]; } -always_inline tcp_header_t * -tcp_buffer_hdr (vlib_buffer_t * b) -{ - ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); - return (tcp_header_t *) (b->data + b->current_data - + vnet_buffer (b)->tcp.hdr_offset); -} - #if (VLIB_BUFFER_TRACE_TRAJECTORY) #define tcp_trajectory_add_start(b, start) \ { \ @@ -769,84 +299,15 @@ tcp_buffer_hdr (vlib_buffer_t * b) #define tcp_trajectory_add_start(b, start) #endif -clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); - -void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); - -always_inline tcp_connection_t * -tcp_connection_get (u32 conn_index, u32 thread_index) -{ - tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); - if (PREDICT_FALSE (pool_is_free_index (wrk->connections, conn_index))) - return 0; - return pool_elt_at_index (wrk->connections, conn_index); -} - -always_inline tcp_connection_t * -tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) -{ - tcp_worker_ctx_t *wrk; - if (thread_index >= vec_len (tcp_main.wrk_ctx)) - return 0; - wrk = tcp_get_worker (thread_index); - if (pool_is_free_index (wrk->connections, conn_index)) - return 0; - return pool_elt_at_index (wrk->connections, conn_index); -} - -always_inline tcp_connection_t * -tcp_get_connection_from_transport (transport_connection_t * tconn) -{ - return 
(tcp_connection_t *) tconn; -} - -always_inline void -tcp_connection_set_state (tcp_connection_t * tc, tcp_state_t state) -{ - tc->state = state; - TCP_EVT (TCP_EVT_STATE_CHANGE, tc); -} - -void tcp_connection_close (tcp_connection_t * tc); -void tcp_connection_cleanup (tcp_connection_t * tc); -void tcp_connection_del (tcp_connection_t * tc); -int tcp_half_open_connection_cleanup (tcp_connection_t * tc); tcp_connection_t *tcp_connection_alloc (u8 thread_index); tcp_connection_t *tcp_connection_alloc_w_base (u8 thread_index, tcp_connection_t * base); void tcp_connection_free (tcp_connection_t * tc); -int tcp_configure_v4_source_address_range (vlib_main_t * vm, - ip4_address_t * start, - ip4_address_t * end, u32 table_id); -int tcp_configure_v6_source_address_range (vlib_main_t * vm, - ip6_address_t * start, - ip6_address_t * end, u32 table_id); -void tcp_api_reference (void); -u8 *format_tcp_connection (u8 * s, va_list * args); -u8 *format_tcp_connection_id (u8 * s, va_list * args); - -always_inline tcp_connection_t * -tcp_listener_get (u32 tli) -{ - tcp_connection_t *tc = 0; - if (!pool_is_free_index (tcp_main.listener_pool, tli)) - tc = pool_elt_at_index (tcp_main.listener_pool, tli); - return tc; -} - -always_inline tcp_connection_t * -tcp_half_open_connection_get (u32 conn_index) -{ - tcp_connection_t *tc = 0; - clib_spinlock_lock_if_init (&tcp_main.half_open_lock); - if (!pool_is_free_index (tcp_main.half_open_connections, conn_index)) - tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index); - clib_spinlock_unlock_if_init (&tcp_main.half_open_lock); - return tc; -} +void tcp_connection_close (tcp_connection_t * tc); +void tcp_connection_cleanup (tcp_connection_t * tc); +void tcp_connection_del (tcp_connection_t * tc); +int tcp_half_open_connection_cleanup (tcp_connection_t * tc); -void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); -void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_send_reset_w_pkt 
(tcp_connection_t * tc, vlib_buffer_t * pkt, u32 thread_index, u8 is_ip4); void tcp_send_reset (tcp_connection_t * tc); @@ -854,256 +315,17 @@ void tcp_send_syn (tcp_connection_t * tc); void tcp_send_synack (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_send_ack (tcp_connection_t * tc); -void tcp_update_burst_snd_vars (tcp_connection_t * tc); -void tcp_update_rto (tcp_connection_t * tc); void tcp_send_window_update_ack (tcp_connection_t * tc); void tcp_program_ack (tcp_connection_t * tc); void tcp_program_dupack (tcp_connection_t * tc); void tcp_program_retransmit (tcp_connection_t * tc); -/* - * Rate estimation - */ - -/** - * Byte tracker initialize - * - * @param tc connection for which the byte tracker should be allocated and - * initialized - */ -void tcp_bt_init (tcp_connection_t * tc); -/** - * Byte tracker cleanup - * - * @param tc connection for which the byte tracker should be cleaned up - */ -void tcp_bt_cleanup (tcp_connection_t * tc); -/** - * Flush byte tracker samples - * - * @param tc tcp connection for which samples should be flushed - */ -void tcp_bt_flush_samples (tcp_connection_t * tc); -/** - * Track a tcp tx burst - * - * @param tc tcp connection - */ -void tcp_bt_track_tx (tcp_connection_t * tc, u32 len); -/** - * Track a tcp retransmission - * - * @param tc tcp connection - * @param start start sequence number - * @param end end sequence number - */ -void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end); -/** - * Generate a delivery rate sample from recently acked bytes - * - * @param tc tcp connection - * @param rs resulting rate sample - */ -void tcp_bt_sample_delivery_rate (tcp_connection_t * tc, - tcp_rate_sample_t * rs); -/** - * Check if sample to be generated is app limited - * - * @param tc tcp connection - */ -void tcp_bt_check_app_limited (tcp_connection_t * tc); -/** - * Check if the byte tracker is in sane state - * - * Should be used only for testing - * - * @param bt byte tracker - */ 
-int tcp_bt_is_sane (tcp_byte_tracker_t * bt); -u8 *format_tcp_bt (u8 * s, va_list * args); - -always_inline u32 -tcp_end_seq (tcp_header_t * th, u32 len) -{ - return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len; -} - -/* Modulo arithmetic for TCP sequence numbers */ -#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) -#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) -#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) -#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) -#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) - -/* Modulo arithmetic for timestamps */ -#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) -#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) - -/** - * Our estimate of the number of bytes that have left the network - */ -always_inline u32 -tcp_bytes_out (const tcp_connection_t * tc) -{ - if (tcp_opts_sack_permitted (&tc->rcv_opts)) - return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; - else - return clib_min (tc->rcv_dupacks * tc->snd_mss, - tc->snd_nxt - tc->snd_una); -} - -/** - * Our estimate of the number of bytes in flight (pipe size) - */ -always_inline u32 -tcp_flight_size (const tcp_connection_t * tc) -{ - int flight_size; - - flight_size = (int) (tc->snd_nxt - tc->snd_una) - tcp_bytes_out (tc) - + tc->snd_rxt_bytes - tc->rxt_delivered; - - ASSERT (flight_size >= 0); - - return flight_size; -} - -/** - * Initial cwnd as per RFC5681 - */ -always_inline u32 -tcp_initial_cwnd (const tcp_connection_t * tc) -{ - if (tcp_cfg.initial_cwnd_multiplier > 0) - return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss; - - if (tc->snd_mss > 2190) - return 2 * tc->snd_mss; - else if (tc->snd_mss > 1095) - return 3 * tc->snd_mss; - else - return 4 * tc->snd_mss; -} - -/* - * Accumulate acked bytes for cwnd increase - * - * Once threshold bytes are accumulated, snd_mss bytes are added - * to the cwnd. 
- */ -always_inline void -tcp_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes) -{ - tc->cwnd_acc_bytes += bytes; - if (tc->cwnd_acc_bytes >= thresh) - { - u32 inc = tc->cwnd_acc_bytes / thresh; - tc->cwnd_acc_bytes -= inc * thresh; - tc->cwnd += inc * tc->snd_mss; - tc->cwnd = clib_min (tc->cwnd, tc->tx_fifo_size); - } -} - -always_inline u32 -tcp_loss_wnd (const tcp_connection_t * tc) -{ - /* Whatever we have in flight + the packet we're about to send */ - return tcp_flight_size (tc) + tc->snd_mss; -} - -always_inline u32 -tcp_available_snd_wnd (const tcp_connection_t * tc) -{ - return clib_min (tc->cwnd, tc->snd_wnd); -} - -always_inline u32 -tcp_available_output_snd_space (const tcp_connection_t * tc) -{ - u32 available_wnd = tcp_available_snd_wnd (tc); - int flight_size = (int) (tc->snd_nxt - tc->snd_una); - - if (available_wnd <= flight_size) - return 0; - - return available_wnd - flight_size; -} - -/** - * Estimate of how many bytes we can still push into the network - */ -always_inline u32 -tcp_available_cc_snd_space (const tcp_connection_t * tc) -{ - u32 available_wnd = tcp_available_snd_wnd (tc); - u32 flight_size = tcp_flight_size (tc); - - if (available_wnd <= flight_size) - return 0; - - return available_wnd - flight_size; -} - -static inline u8 -tcp_is_descheduled (tcp_connection_t * tc) -{ - return (transport_connection_is_descheduled (&tc->connection) ? 
1 : 0); -} - -always_inline u8 -tcp_is_lost_fin (tcp_connection_t * tc) -{ - if ((tc->flags & TCP_CONN_FINSNT) && (tc->snd_una_max - tc->snd_una == 1)) - return 1; - return 0; -} - +void tcp_update_burst_snd_vars (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); int tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc); void tcp_reschedule (tcp_connection_t * tc); - fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc); - -/* Made public for unit testing only */ -void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); -u32 tcp_sack_list_bytes (tcp_connection_t * tc); - -always_inline u32 -tcp_time_now (void) -{ - return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now; -} - -always_inline u32 -tcp_time_now_w_thread (u32 thread_index) -{ - return tcp_main.wrk_ctx[thread_index].time_now; -} - -/** - * Generate timestamp for tcp connection - */ -always_inline u32 -tcp_tstamp (tcp_connection_t * tc) -{ - return (tcp_main.wrk_ctx[tc->c_thread_index].time_now - - tc->timestamp_delta); -} - -always_inline f64 -tcp_time_now_us (u32 thread_index) -{ - return transport_time_now (thread_index); -} - -always_inline u32 -tcp_set_time_now (tcp_worker_ctx_t * wrk) -{ - wrk->time_now = clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; - return wrk->time_now; -} - u32 tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b); int tcp_session_custom_tx (void *conn, u32 max_burst_size); @@ -1117,64 +339,22 @@ void tcp_connection_tx_pacer_reset (tcp_connection_t * tc, u32 window, u32 start_bucket); void tcp_program_cleanup (tcp_worker_ctx_t * wrk, tcp_connection_t * tc); -always_inline void -tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs) -{ - tc->cc_algo->rcv_ack (tc, rs); - tc->tsecr_last_ack = tc->rcv_opts.tsecr; -} - -static inline void -tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, - tcp_rate_sample_t * rs) -{ - tc->cc_algo->rcv_cong_ack (tc, ack_type, rs); -} - -static 
inline void -tcp_cc_congestion (tcp_connection_t * tc) -{ - tc->cc_algo->congestion (tc); -} - -static inline void -tcp_cc_loss (tcp_connection_t * tc) -{ - tc->cc_algo->loss (tc); -} - -static inline void -tcp_cc_recovered (tcp_connection_t * tc) -{ - tc->cc_algo->recovered (tc); -} - -static inline void -tcp_cc_undo_recovery (tcp_connection_t * tc) -{ - if (tc->cc_algo->undo_recovery) - tc->cc_algo->undo_recovery (tc); -} - -static inline void -tcp_cc_event (tcp_connection_t * tc, tcp_cc_event_t evt) -{ - if (tc->cc_algo->event) - tc->cc_algo->event (tc, evt); -} - -static inline u64 -tcp_cc_get_pacing_rate (tcp_connection_t * tc) -{ - if (tc->cc_algo->get_pacing_rate) - return tc->cc_algo->get_pacing_rate (tc); +void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); +int tcp_configure_v4_source_address_range (vlib_main_t * vm, + ip4_address_t * start, + ip4_address_t * end, u32 table_id); +int tcp_configure_v6_source_address_range (vlib_main_t * vm, + ip6_address_t * start, + ip6_address_t * end, u32 table_id); - f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us); +clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); - /* TODO should constrain to interface's max throughput but - * we don't have link speeds for sw ifs ..*/ - return ((f64) tc->cwnd / srtt); -} +format_function_t format_tcp_state; +format_function_t format_tcp_flags; +format_function_t format_tcp_sacks; +format_function_t format_tcp_rcv_sacks; +format_function_t format_tcp_connection; +format_function_t format_tcp_connection_id; always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) @@ -1287,101 +467,6 @@ tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ || transport_max_tx_dequeue (&_tc->connection) >= _a) -void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); -u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose); - -/** - * Register exiting cc algo type - */ 
-void tcp_cc_algo_register (tcp_cc_algorithm_type_e type, - const tcp_cc_algorithm_t * vft); - -/** - * Register new cc algo type - */ -tcp_cc_algorithm_type_e tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft); -tcp_cc_algorithm_t *tcp_cc_algo_get (tcp_cc_algorithm_type_e type); - -static inline void * -tcp_cc_data (tcp_connection_t * tc) -{ - return (void *) tc->cc_data; -} - -void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, - tcp_rate_sample_t * rs); -/** - * Initialize connection by gleaning network and rcv params from buffer - * - * @param tc connection to initialize - * @param b buffer whose current data is pointing at ip - * @param is_ip4 flag set to 1 if using ip4 - */ -void tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4); - -/** - * Push TCP header to buffer - * - * @param vm - vlib_main - * @param b - buffer to write the header to - * @param sp_net - source port net order - * @param dp_net - destination port net order - * @param seq - sequence number net order - * @param ack - ack number net order - * @param tcp_hdr_opts_len - header and options length in bytes - * @param flags - header flags - * @param wnd - window size - * - * @return - pointer to start of TCP header - */ -always_inline void * -vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, - u32 ack, u8 tcp_hdr_opts_len, u8 flags, - u16 wnd) -{ - tcp_header_t *th; - - th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len); - - th->src_port = sp; - th->dst_port = dp; - th->seq_number = seq; - th->ack_number = ack; - th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4; - th->flags = flags; - th->window = wnd; - th->checksum = 0; - th->urgent_pointer = 0; - vnet_buffer (b)->l4_hdr_offset = (u8 *) th - b->data; - b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID; - return th; -} - -/** - * Push TCP header to buffer - * - * @param b - buffer to write the header to - * @param sp_net - source port net order - * @param dp_net - 
destination port net order - * @param seq - sequence number host order - * @param ack - ack number host order - * @param tcp_hdr_opts_len - header and options length in bytes - * @param flags - header flags - * @param wnd - window size - * - * @return - pointer to start of TCP header - */ -always_inline void * -vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq, - u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd) -{ - return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, - clib_host_to_net_u32 (seq), - clib_host_to_net_u32 (ack), - tcp_hdr_opts_len, flags, - clib_host_to_net_u16 (wnd)); -} - #endif /* _vnet_tcp_h_ */ /* diff --git a/src/vnet/tcp/tcp_api.c b/src/vnet/tcp/tcp_api.c index ac4314f0e83..8b169f8f1e3 100644 --- a/src/vnet/tcp/tcp_api.c +++ b/src/vnet/tcp/tcp_api.c @@ -115,11 +115,6 @@ tcp_api_hookup (vlib_main_t * vm) VLIB_API_INIT_FUNCTION (tcp_api_hookup); -void -tcp_api_reference (void) -{ -} - /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c index e8dc5c9c068..6f9ee0168f7 100644 --- a/src/vnet/tcp/tcp_bt.c +++ b/src/vnet/tcp/tcp_bt.c @@ -16,7 +16,9 @@ * draft-cheng-iccrg-delivery-rate-estimation-00 */ +#include <vnet/tcp/tcp_bt.h> #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> static tcp_bt_sample_t * bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index) diff --git a/src/vnet/tcp/tcp_bt.h b/src/vnet/tcp/tcp_bt.h new file mode 100644 index 00000000000..b9d0e571a5d --- /dev/null +++ b/src/vnet/tcp/tcp_bt.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Rate estimation + */ + +#ifndef SRC_VNET_TCP_TCP_BT_H_ +#define SRC_VNET_TCP_TCP_BT_H_ + +#include <vnet/tcp/tcp_types.h> + +/** + * Byte tracker initialize + * + * @param tc connection for which the byte tracker should be allocated and + * initialized + */ +void tcp_bt_init (tcp_connection_t * tc); +/** + * Byte tracker cleanup + * + * @param tc connection for which the byte tracker should be cleaned up + */ +void tcp_bt_cleanup (tcp_connection_t * tc); +/** + * Flush byte tracker samples + * + * @param tc tcp connection for which samples should be flushed + */ +void tcp_bt_flush_samples (tcp_connection_t * tc); +/** + * Track a tcp tx burst + * + * @param tc tcp connection + */ +void tcp_bt_track_tx (tcp_connection_t * tc, u32 len); +/** + * Track a tcp retransmission + * + * @param tc tcp connection + * @param start start sequence number + * @param end end sequence number + */ +void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end); +/** + * Generate a delivery rate sample from recently acked bytes + * + * @param tc tcp connection + * @param rs resulting rate sample + */ +void tcp_bt_sample_delivery_rate (tcp_connection_t * tc, + tcp_rate_sample_t * rs); +/** + * Check if sample to be generated is app limited + * + * @param tc tcp connection + */ +void tcp_bt_check_app_limited (tcp_connection_t * tc); +/** + * Check if the byte tracker is in sane state + * + * Should be used only for testing + * + * @param bt byte tracker + */ +int tcp_bt_is_sane (tcp_byte_tracker_t * bt); + +format_function_t format_tcp_bt; + +#endif 
/* SRC_VNET_TCP_TCP_BT_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_cc.h b/src/vnet/tcp/tcp_cc.h new file mode 100644 index 00000000000..54d2dc6334b --- /dev/null +++ b/src/vnet/tcp/tcp_cc.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_TCP_TCP_CC_H_ +#define SRC_VNET_TCP_TCP_CC_H_ + +#include <vnet/tcp/tcp_types.h> + +always_inline void +tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs) +{ + tc->cc_algo->rcv_ack (tc, rs); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; +} + +static inline void +tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, + tcp_rate_sample_t * rs) +{ + tc->cc_algo->rcv_cong_ack (tc, ack_type, rs); +} + +static inline void +tcp_cc_congestion (tcp_connection_t * tc) +{ + tc->cc_algo->congestion (tc); +} + +static inline void +tcp_cc_loss (tcp_connection_t * tc) +{ + tc->cc_algo->loss (tc); +} + +static inline void +tcp_cc_recovered (tcp_connection_t * tc) +{ + tc->cc_algo->recovered (tc); +} + +static inline void +tcp_cc_undo_recovery (tcp_connection_t * tc) +{ + if (tc->cc_algo->undo_recovery) + tc->cc_algo->undo_recovery (tc); +} + +static inline void +tcp_cc_event (tcp_connection_t * tc, tcp_cc_event_t evt) +{ + if (tc->cc_algo->event) + tc->cc_algo->event (tc, evt); +} + +static inline u64 
+tcp_cc_get_pacing_rate (tcp_connection_t * tc) +{ + if (tc->cc_algo->get_pacing_rate) + return tc->cc_algo->get_pacing_rate (tc); + + f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us); + + /* TODO should constrain to interface's max throughput but + * we don't have link speeds for sw ifs ..*/ + return ((f64) tc->cwnd / srtt); +} + +static inline void * +tcp_cc_data (tcp_connection_t * tc) +{ + return (void *) tc->cc_data; +} + +/** + * Register exiting cc algo type + */ +void tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft); + +/** + * Register new cc algo type + */ +tcp_cc_algorithm_type_e tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft); +tcp_cc_algorithm_t *tcp_cc_algo_get (tcp_cc_algorithm_type_e type); + + +void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type, + tcp_rate_sample_t * rs); + + +#endif /* SRC_VNET_TCP_TCP_CC_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_cli.c b/src/vnet/tcp/tcp_cli.c new file mode 100644 index 00000000000..a28e2c83659 --- /dev/null +++ b/src/vnet/tcp/tcp_cli.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> +#include <vnet/dpo/receive_dpo.h> +#include <vnet/ip-neighbor/ip_neighbor.h> + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + +u8 * +format_tcp_state (u8 * s, va_list * args) +{ + u32 state = va_arg (*args, u32); + + if (state < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state]); + else + s = format (s, "UNKNOWN (%d (0x%x))", state, state); + return s; +} + +const char *tcp_cfg_flags_str[] = { +#define _(sym, str) str, + foreach_tcp_cfg_flag +#undef _ +}; + +static u8 * +format_tcp_cfg_flags (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = -1; + + for (i = 0; i < TCP_CFG_N_FLAG_BITS; i++) + if (tc->cfg_flags & (1 << i)) + last = i; + for (i = 0; i < last; i++) + { + if (tc->cfg_flags & (1 << i)) + s = format (s, "%s, ", tcp_cfg_flags_str[i]); + } + if (last >= 0) + s = format (s, "%s", tcp_cfg_flags_str[last]); + return s; +} + +const char *tcp_connection_flags_str[] = { +#define _(sym, str) str, + foreach_tcp_connection_flag +#undef _ +}; + +static u8 * +format_tcp_connection_flags (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = -1; + + for (i = 0; i < TCP_CONN_N_FLAG_BITS; i++) + if (tc->flags & (1 << i)) + last = i; + for (i = 0; i < last; i++) + { + if (tc->flags & (1 << i)) + s = format (s, "%s, ", tcp_connection_flags_str[i]); + } + if (last >= 0) + s = format (s, "%s", tcp_connection_flags_str[last]); + return s; +} + +const char *tcp_conn_timers[] = { +#define _(sym, str) str, + foreach_tcp_timer +#undef _ +}; + +static u8 * +format_tcp_timers (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = -1; + + for (i = 0; i < TCP_N_TIMERS; i++) + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + last = i; + + for (i = 0; i < last; i++) + { + if (tc->timers[i] != 
TCP_TIMER_HANDLE_INVALID) + s = format (s, "%s,", tcp_conn_timers[i]); + } + + if (last >= 0) + s = format (s, "%s", tcp_conn_timers[i]); + + return s; +} + +static u8 * +format_tcp_congestion_status (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tcp_in_recovery (tc)) + s = format (s, "recovery"); + else if (tcp_in_fastrecovery (tc)) + s = format (s, "fastrecovery"); + else + s = format (s, "none"); + return s; +} + +static i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + +static u8 * +format_tcp_congestion (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + u32 indent = format_get_indent (s), prr_space = 0; + + s = format (s, "%U ", format_tcp_congestion_status, tc); + s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n", + tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked); + s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u\n", + format_white_space, indent, tcp_available_cc_snd_space (tc), + tc->prev_cwnd, tc->prev_ssthresh); + s = format (s, "%Usnd_cong %u dupack %u limited_tx %u\n", + format_white_space, indent, tc->snd_congestion - tc->iss, + tc->rcv_dupacks, tc->limited_transmit - tc->iss); + s = format (s, "%Urxt_bytes %u rxt_delivered %u rxt_head %u rxt_ts %u\n", + format_white_space, indent, tc->snd_rxt_bytes, + tc->rxt_delivered, tc->rxt_head - tc->iss, + tcp_time_now_w_thread (tc->c_thread_index) - tc->snd_rxt_ts); + if (tcp_in_fastrecovery (tc)) + prr_space = tcp_fastrecovery_prr_snd_space (tc); + s = format (s, "%Uprr_start %u prr_delivered %u prr space %u\n", + format_white_space, indent, tc->prr_start - tc->iss, + tc->prr_delivered, prr_space); + return s; +} + +static u8 * +format_tcp_stats (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + u32 indent = format_get_indent (s); + s = format (s, "in segs %lu dsegs %lu bytes %lu 
dupacks %u\n", + tc->segs_in, tc->data_segs_in, tc->bytes_in, tc->dupacks_in); + s = format (s, "%Uout segs %lu dsegs %lu bytes %lu dupacks %u\n", + format_white_space, indent, tc->segs_out, + tc->data_segs_out, tc->bytes_out, tc->dupacks_out); + s = format (s, "%Ufr %u tr %u rxt segs %lu bytes %lu duration %.3f\n", + format_white_space, indent, tc->fr_occurences, + tc->tr_occurences, tc->segs_retrans, tc->bytes_retrans, + tcp_time_now_us (tc->c_thread_index) - tc->start_ts); + s = format (s, "%Uerr wnd data below %u above %u ack below %u above %u", + format_white_space, indent, tc->errors.below_data_wnd, + tc->errors.above_data_wnd, tc->errors.below_ack_wnd, + tc->errors.above_ack_wnd); + return s; +} + +static u8 * +format_tcp_vars (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + s = format (s, " index: %u cfg: %U flags: %U timers: %U\n", tc->c_c_index, + format_tcp_cfg_flags, tc, format_tcp_connection_flags, tc, + format_tcp_timers, tc); + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", + tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, + tc->snd_una_max - tc->iss); + s = format (s, " rcv_nxt %u rcv_las %u\n", + tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs); + s = format (s, " snd_wnd %u rcv_wnd %u rcv_wscale %u ", + tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale); + s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs, + tc->snd_wl2 - tc->iss); + s = format (s, " flight size %u out space %u rcv_wnd_av %u", + tcp_flight_size (tc), tcp_available_output_snd_space (tc), + tcp_rcv_wnd_available (tc)); + s = format (s, " tsval_recent %u\n", tc->tsval_recent); + s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u", + tc->rcv_opts.tsecr, tc->tsecr_last_ack, + tcp_time_now () - tc->tsval_recent_age); + s = format (s, " snd_mss %u\n", tc->snd_mss); + s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f", + tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar, + tc->rtt_ts); + 
s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss); + s = format (s, " next_node %u opaque 0x%x fib_index %u\n", + tc->next_node_index, tc->next_node_opaque, tc->c_fib_index); + s = format (s, " cong: %U", format_tcp_congestion, tc); + + if (tc->state >= TCP_STATE_ESTABLISHED) + { + s = format (s, " sboard: %U\n", format_tcp_scoreboard, &tc->sack_sb, + tc); + s = format (s, " stats: %U\n", format_tcp_stats, tc); + } + if (vec_len (tc->snd_sacks)) + s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); + + return s; +} + +u8 * +format_tcp_connection_id (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (!tc) + return s; + if (tc->c_is_ip4) + { + s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index, + tc->c_s_index, "T", format_ip4_address, &tc->c_lcl_ip4, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address, + &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port)); + } + else + { + s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index, + tc->c_s_index, "T", format_ip6_address, &tc->c_lcl_ip6, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address, + &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port)); + } + + return s; +} + +u8 * +format_tcp_connection (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + u32 verbose = va_arg (*args, u32); + + if (!tc) + return s; + s = format (s, "%-50U", format_tcp_connection_id, tc); + if (verbose) + { + s = format (s, "%-15U", format_tcp_state, tc->state); + if (verbose > 1) + s = format (s, "\n%U", format_tcp_vars, tc); + } + + return s; +} + +u8 * +format_tcp_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->snd_sacks; + sack_block_t *block; + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if 
(len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } + return s; +} + +u8 * +format_tcp_rcv_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->rcv_opts.sacks; + sack_block_t *block; + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->iss, + block->end - tc->iss); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->iss, + block->end - tc->iss); + } + return s; +} + +static u8 * +format_tcp_sack_hole (u8 * s, va_list * args) +{ + sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tc) + s = format (s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss); + else + s = format (s, " [%u, %u]", hole->start, hole->end); + return s; +} + +u8 * +format_tcp_scoreboard (u8 * s, va_list * args) +{ + sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_scoreboard_hole_t *hole; + u32 indent = format_get_indent (s); + + s = format (s, "sacked %u last_sacked %u lost %u last_lost %u" + " rxt_sacked %u\n", + sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes, + sb->last_lost_bytes, sb->rxt_sacked); + s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n", + format_white_space, indent, sb->last_bytes_delivered, + sb->high_sacked - tc->iss, sb->is_reneging); + s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u", + format_white_space, indent, sb->cur_rxt_hole, + sb->high_rxt - tc->iss, sb->rescue_rxt - tc->iss); + + hole = scoreboard_first_hole (sb); + if (hole) + s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space, + indent, sb->head, sb->tail, pool_elts (sb->holes), + 
format_white_space, indent); + + while (hole) + { + s = format (s, "%U", format_tcp_sack_hole, hole, tc); + hole = scoreboard_next_hole (sb, hole); + } + + return s; +} + +/** + * \brief Configure an ipv4 source address range + * @param vm vlib_main_t pointer + * @param start first ipv4 address in the source address range + * @param end last ipv4 address in the source address range + * @param table_id VRF / table ID, 0 for the default FIB + * @return 0 if all OK, else an error indication from api_errno.h + */ + +int +tcp_configure_v4_source_address_range (vlib_main_t * vm, + ip4_address_t * start, + ip4_address_t * end, u32 table_id) +{ + u32 start_host_byte_order, end_host_byte_order; + fib_prefix_t prefix; + fib_node_index_t fei; + u32 fib_index = 0; + u32 sw_if_index; + int rv; + + clib_memset (&prefix, 0, sizeof (prefix)); + + fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id); + + if (fib_index == ~0) + return VNET_API_ERROR_NO_SUCH_FIB; + + start_host_byte_order = clib_net_to_host_u32 (start->as_u32); + end_host_byte_order = clib_net_to_host_u32 (end->as_u32); + + /* sanity check for reversed args or some such */ + if ((end_host_byte_order - start_host_byte_order) > (10 << 10)) + return VNET_API_ERROR_INVALID_ARGUMENT; + + /* Lookup the last address, to identify the interface involved */ + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t)); + + fei = fib_table_lookup (fib_index, &prefix); + + /* Couldn't find route to destination. Bail out. 
*/ + if (fei == FIB_NODE_INDEX_INVALID) + return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + /* Configure proxy arp across the range */ + rv = ip4_neighbor_proxy_add (fib_index, start, end); + + if (rv) + return rv; + + rv = ip4_neighbor_proxy_enable (sw_if_index); + + if (rv) + return rv; + + do + { + dpo_id_t dpo = DPO_INVALID; + + vec_add1 (tcp_cfg.ip4_src_addrs, start[0]); + + /* Add local adjacencies for the range */ + + receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ , + NULL, &dpo); + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + prefix.fp_addr.ip4.as_u32 = start->as_u32; + + fib_table_entry_special_dpo_update (fib_index, + &prefix, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); + dpo_reset (&dpo); + + start_host_byte_order++; + start->as_u32 = clib_host_to_net_u32 (start_host_byte_order); + } + while (start_host_byte_order <= end_host_byte_order); + + return 0; +} + +/** + * \brief Configure an ipv6 source address range + * @param vm vlib_main_t pointer + * @param start first ipv6 address in the source address range + * @param end last ipv6 address in the source address range + * @param table_id VRF / table ID, 0 for the default FIB + * @return 0 if all OK, else an error indication from api_errno.h + */ + +int +tcp_configure_v6_source_address_range (vlib_main_t * vm, + ip6_address_t * start, + ip6_address_t * end, u32 table_id) +{ + fib_prefix_t prefix; + u32 fib_index = 0; + fib_node_index_t fei; + u32 sw_if_index; + + clib_memset (&prefix, 0, sizeof (prefix)); + + fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id); + + if (fib_index == ~0) + return VNET_API_ERROR_NO_SUCH_FIB; + + while (1) + { + int i; + ip6_address_t tmp; + dpo_id_t dpo = DPO_INVALID; + + /* Remember this address */ + vec_add1 (tcp_cfg.ip6_src_addrs, start[0]); + + /* Lookup the prefix, to identify the interface involved */ + prefix.fp_len = 128; + prefix.fp_proto = FIB_PROTOCOL_IP6; + memcpy 
(&prefix.fp_addr.ip6, start, sizeof (ip6_address_t)); + + fei = fib_table_lookup (fib_index, &prefix); + + /* Couldn't find route to destination. Bail out. */ + if (fei == FIB_NODE_INDEX_INVALID) + return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + if (sw_if_index == (u32) ~ 0) + return VNET_API_ERROR_NO_MATCHING_INTERFACE; + + /* Add a proxy neighbor discovery entry for this address */ + ip6_neighbor_proxy_add (sw_if_index, start); + + /* Add a receive adjacency for this address */ + receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ , + NULL, &dpo); + + fib_table_entry_special_dpo_update (fib_index, + &prefix, + FIB_SOURCE_API, + FIB_ENTRY_FLAG_EXCLUSIVE, &dpo); + dpo_reset (&dpo); + + /* Done with the entire range? */ + if (!memcmp (start, end, sizeof (start[0]))) + break; + + /* Increment the address. DGMS. */ + tmp = start[0]; + for (i = 15; i >= 0; i--) + { + tmp.as_u8[i] += 1; + if (tmp.as_u8[i] != 0) + break; + } + start[0] = tmp; + } + return 0; +} + +static clib_error_t * +tcp_src_address_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + u32 table_id = 0; + int v4set = 0; + int v6set = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip6_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v6start)); + v6set = 1; + } + else if (unformat (input, "fib-table %d", &table_id)) + ; + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at 
least one v4 or v6 address required"); + + if (v4set) + { + rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end, + table_id); + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_SUCH_FIB: + return clib_error_return (0, "Invalid table-id %d", table_id); + + case VNET_API_ERROR_INVALID_ARGUMENT: + return clib_error_return (0, "Invalid address range %U - %U", + format_ip4_address, &v4start, + format_ip4_address, &v4end); + default: + return clib_error_return (0, "error %d", rv); + break; + } + } + if (v6set) + { + rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end, + table_id); + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_SUCH_FIB: + return clib_error_return (0, "Invalid table-id %d", table_id); + + default: + return clib_error_return (0, "error %d", rv); + break; + } + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range", + .function = tcp_src_address_fn, +}; +/* *INDENT-ON* */ + +static u8 * +tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb) +{ +#if TCP_SCOREBOARD_TRACE + + scoreboard_trace_elt_t *block; + int i = 0; + + if (!sb->trace) + return s; + + s = format (s, "scoreboard trace:"); + vec_foreach (block, sb->trace) + { + s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end, + block->ack, block->snd_una_max, block->group); + if ((++i % 3) == 0) + s = format (s, "\n"); + } + return s; +#else + return 0; +#endif +} + +static clib_error_t * +tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc; + u8 *s = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input 
`%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + s = tcp_scoreboard_dump_trace (s, &tc->sack_sb); + vlib_cli_output (vm, "%v", s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) = +{ + .path = "show tcp scoreboard trace", + .short_help = "show tcp scoreboard trace <connection>", + .function = tcp_show_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ + +u8 * +tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose) +{ + int i, trace_len; + scoreboard_trace_elt_t *trace; + u32 next_ack, left, group, has_new_ack = 0; + tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc; + sack_block_t *block; + + if (!TCP_SCOREBOARD_TRACE) + { + s = format (s, "scoreboard tracing not enabled"); + return s; + } + + if (!tc) + return s; + + clib_memset (dummy_tc, 0, sizeof (*dummy_tc)); + tcp_connection_timers_init (dummy_tc); + scoreboard_init (&dummy_tc->sack_sb); + dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; + +#if TCP_SCOREBOARD_TRACE + trace = tc->sack_sb.trace; + trace_len = vec_len (tc->sack_sb.trace); +#endif + + for (i = 0; i < trace_len; i++) + { + if (trace[i].ack != 0) + { + dummy_tc->snd_una = trace[i].ack - 1448; + dummy_tc->snd_una_max = trace[i].ack; + } + } + + left = 0; + while (left < trace_len) + { + group = trace[left].group; + vec_reset_length (dummy_tc->rcv_opts.sacks); + has_new_ack = 0; + while (trace[left].group == group) + { + if (trace[left].ack != 0) + { + if (verbose) + s = format (s, "Adding ack %u, snd_una_max %u, segs: ", + trace[left].ack, trace[left].snd_una_max); + dummy_tc->snd_una_max = trace[left].snd_una_max; + next_ack = trace[left].ack; + has_new_ack = 1; + } + else + { + if (verbose) + s = format (s, "[%u, %u], ", trace[left].start, + trace[left].end); + vec_add2 (dummy_tc->rcv_opts.sacks, block, 1); + block->start = 
trace[left].start; + block->end = trace[left].end; + } + left++; + } + + /* Push segments */ + tcp_rcv_sacks (dummy_tc, next_ack); + if (has_new_ack) + dummy_tc->snd_una = next_ack; + + if (verbose) + s = format (s, "result: %U", format_tcp_scoreboard, + &dummy_tc->sack_sb); + + } + s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb); + + return s; +} + +static clib_error_t * +tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc = 0; + u8 *str = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + if (!tc) + { + vlib_cli_output (vm, "connection not found"); + return 0; + } + str = tcp_scoreboard_replay (str, tc, 1); + vlib_cli_output (vm, "%v", str); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) = +{ + .path = "tcp replay scoreboard", + .short_help = "tcp replay scoreboard <connection>", + .function = tcp_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + vlib_cli_output (vm, "IPv4 TCP punt: %s", + tm->punt_unknown4 ? "enabled" : "disabled"); + vlib_cli_output (vm, "IPv6 TCP punt: %s", + tm->punt_unknown6 ? 
"enabled" : "disabled"); + return 0; +} +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_tcp_punt_command, static) = +{ + .path = "show tcp punt", + .short_help = "show tcp punt", + .function = show_tcp_punt_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_worker_ctx_t *wrk; + u32 thread; + + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++) + { + wrk = tcp_get_worker (thread); + vlib_cli_output (vm, "Thread %u:\n", thread); + + if (clib_fifo_elts (wrk->pending_timers)) + vlib_cli_output (vm, " %lu pending timers", + clib_fifo_elts (wrk->pending_timers)); + +#define _(name,type,str) \ + if (wrk->stats.name) \ + vlib_cli_output (vm, " %lu %s", wrk->stats.name, str); + foreach_tcp_wrk_stat +#undef _ + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_tcp_stats_command, static) = +{ + .path = "show tcp stats", + .short_help = "show tcp stats", + .function = show_tcp_stats_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +clear_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_worker_ctx_t *wrk; + u32 thread; + + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + + for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++) + { + wrk = tcp_get_worker (thread); + clib_memset (&wrk->stats, 0, sizeof (wrk->stats)); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_tcp_stats_command, static) = +{ + .path = "clear tcp stats", + .short_help = "clear tcp stats", + .function = clear_tcp_stats_fn, +}; +/* *INDENT-ON* */ + +uword +unformat_tcp_cc_algo (unformat_input_t * 
input, va_list * va) +{ + tcp_cc_algorithm_type_e *result = va_arg (*va, tcp_cc_algorithm_type_e *); + tcp_main_t *tm = &tcp_main; + char *cc_algo_name; + u8 found = 0; + uword *p; + + if (unformat (input, "%s", &cc_algo_name) + && ((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name)))) + { + *result = *p; + found = 1; + } + + vec_free (cc_algo_name); + return found; +} + +uword +unformat_tcp_cc_algo_cfg (unformat_input_t * input, va_list * va) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_cc_algorithm_t *cc_alg; + unformat_input_t sub_input; + int found = 0; + + vec_foreach (cc_alg, tm->cc_algos) + { + if (!unformat (input, cc_alg->name)) + continue; + + if (cc_alg->unformat_cfg + && unformat (input, "%U", unformat_vlib_cli_sub_input, &sub_input)) + { + if (cc_alg->unformat_cfg (&sub_input)) + found = 1; + } + } + return found; +} + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + u32 cwnd_multiplier, tmp_time; + uword memory_size; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "preallocated-connections %d", + &tcp_cfg.preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tcp_cfg.preallocated_half_open_connections)) + ; + else if (unformat (input, "buffer-fail-fraction %f", + &tcp_cfg.buffer_fail_fraction)) + ; + else if (unformat (input, "max-rx-fifo %U", unformat_memory_size, + &memory_size)) + { + if (memory_size >= 0x100000000) + { + return clib_error_return + (0, "max-rx-fifo %llu (0x%llx) too large", memory_size, + memory_size); + } + tcp_cfg.max_rx_fifo = memory_size; + } + else if (unformat (input, "min-rx-fifo %U", unformat_memory_size, + &memory_size)) + { + if (memory_size >= 0x100000000) + { + return clib_error_return + (0, "min-rx-fifo %llu (0x%llx) too large", memory_size, + memory_size); + } + tcp_cfg.min_rx_fifo = memory_size; + } + else if (unformat (input, "mtu %u", &tcp_cfg.default_mtu)) + ; + else if 
(unformat (input, "rwnd-min-update-ack %d", + &tcp_cfg.rwnd_min_update_ack)) + ; + else if (unformat (input, "initial-cwnd-multiplier %u", + &cwnd_multiplier)) + tcp_cfg.initial_cwnd_multiplier = cwnd_multiplier; + else if (unformat (input, "no-tx-pacing")) + tcp_cfg.enable_tx_pacing = 0; + else if (unformat (input, "tso")) + tcp_cfg.allow_tso = 1; + else if (unformat (input, "no-csum-offload")) + tcp_cfg.csum_offload = 0; + else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo, + &tcp_cfg.cc_algo)) + ; + else if (unformat (input, "%U", unformat_tcp_cc_algo_cfg)) + ; + else if (unformat (input, "closewait-time %u", &tmp_time)) + tcp_cfg.closewait_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "timewait-time %u", &tmp_time)) + tcp_cfg.timewait_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "finwait1-time %u", &tmp_time)) + tcp_cfg.finwait1_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "finwait2-time %u", &tmp_time)) + tcp_cfg.finwait2_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "lastack-time %u", &tmp_time)) + tcp_cfg.lastack_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "closing-time %u", &tmp_time)) + tcp_cfg.closing_time = tmp_time / TCP_TIMER_TICK; + else if (unformat (input, "cleanup-time %u", &tmp_time)) + tcp_cfg.cleanup_time = tmp_time / 1000.0; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_cubic.c b/src/vnet/tcp/tcp_cubic.c index b79ef8342d3..b8ac80a8feb 100644 --- a/src/vnet/tcp/tcp_cubic.c +++ b/src/vnet/tcp/tcp_cubic.c @@ -14,6 +14,7 @@ */ #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> #include <math.h> #define beta_cubic 0.7 diff --git a/src/vnet/tcp/tcp_inlines.h b/src/vnet/tcp/tcp_inlines.h 
new file mode 100644 index 00000000000..2281cd3db7d --- /dev/null +++ b/src/vnet/tcp/tcp_inlines.h @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_TCP_TCP_INLINES_H_ +#define SRC_VNET_TCP_TCP_INLINES_H_ + +#include <vnet/tcp/tcp.h> + +always_inline tcp_header_t * +tcp_buffer_hdr (vlib_buffer_t * b) +{ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return (tcp_header_t *) (b->data + b->current_data + + vnet_buffer (b)->tcp.hdr_offset); +} + +always_inline tcp_connection_t * +tcp_connection_get (u32 conn_index, u32 thread_index) +{ + tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); + if (PREDICT_FALSE (pool_is_free_index (wrk->connections, conn_index))) + return 0; + return pool_elt_at_index (wrk->connections, conn_index); +} + +always_inline tcp_connection_t * +tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) +{ + tcp_worker_ctx_t *wrk; + if (thread_index >= vec_len (tcp_main.wrk_ctx)) + return 0; + wrk = tcp_get_worker (thread_index); + if (pool_is_free_index (wrk->connections, conn_index)) + return 0; + return pool_elt_at_index (wrk->connections, conn_index); +} + +always_inline void +tcp_connection_set_state (tcp_connection_t * tc, tcp_state_t state) +{ + tc->state = state; + TCP_EVT (TCP_EVT_STATE_CHANGE, tc); +} + +always_inline tcp_connection_t * +tcp_listener_get (u32 tli) +{ + tcp_connection_t *tc = 0; + 
if (!pool_is_free_index (tcp_main.listener_pool, tli)) + tc = pool_elt_at_index (tcp_main.listener_pool, tli); + return tc; +} + +always_inline tcp_connection_t * +tcp_half_open_connection_get (u32 conn_index) +{ + tcp_connection_t *tc = 0; + clib_spinlock_lock_if_init (&tcp_main.half_open_lock); + if (!pool_is_free_index (tcp_main.half_open_connections, conn_index)) + tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index); + clib_spinlock_unlock_if_init (&tcp_main.half_open_lock); + return tc; +} + +/** + * Our estimate of the number of bytes that have left the network + */ +always_inline u32 +tcp_bytes_out (const tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; + else + return clib_min (tc->rcv_dupacks * tc->snd_mss, + tc->snd_nxt - tc->snd_una); +} + +/** + * Our estimate of the number of bytes in flight (pipe size) + */ +always_inline u32 +tcp_flight_size (const tcp_connection_t * tc) +{ + int flight_size; + + flight_size = (int) (tc->snd_nxt - tc->snd_una) - tcp_bytes_out (tc) + + tc->snd_rxt_bytes - tc->rxt_delivered; + + ASSERT (flight_size >= 0); + + return flight_size; +} + +/** + * Initial cwnd as per RFC5681 + */ +always_inline u32 +tcp_initial_cwnd (const tcp_connection_t * tc) +{ + if (tcp_cfg.initial_cwnd_multiplier > 0) + return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss; + + if (tc->snd_mss > 2190) + return 2 * tc->snd_mss; + else if (tc->snd_mss > 1095) + return 3 * tc->snd_mss; + else + return 4 * tc->snd_mss; +} + +/* + * Accumulate acked bytes for cwnd increase + * + * Once threshold bytes are accumulated, snd_mss bytes are added + * to the cwnd. 
+ */ +always_inline void +tcp_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes) +{ + tc->cwnd_acc_bytes += bytes; + if (tc->cwnd_acc_bytes >= thresh) + { + u32 inc = tc->cwnd_acc_bytes / thresh; + tc->cwnd_acc_bytes -= inc * thresh; + tc->cwnd += inc * tc->snd_mss; + tc->cwnd = clib_min (tc->cwnd, tc->tx_fifo_size); + } +} + +always_inline u32 +tcp_loss_wnd (const tcp_connection_t * tc) +{ + /* Whatever we have in flight + the packet we're about to send */ + return tcp_flight_size (tc) + tc->snd_mss; +} + +always_inline u32 +tcp_available_snd_wnd (const tcp_connection_t * tc) +{ + return clib_min (tc->cwnd, tc->snd_wnd); +} + +always_inline u32 +tcp_available_output_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + int flight_size = (int) (tc->snd_nxt - tc->snd_una); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +/** + * Estimate of how many bytes we can still push into the network + */ +always_inline u32 +tcp_available_cc_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + u32 flight_size = tcp_flight_size (tc); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +always_inline u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && (tc->snd_una_max - tc->snd_una == 1)) + return 1; + return 0; +} + +always_inline u32 +tcp_time_now (void) +{ + return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now; +} + +always_inline u32 +tcp_time_now_w_thread (u32 thread_index) +{ + return tcp_main.wrk_ctx[thread_index].time_now; +} + +/** + * Generate timestamp for tcp connection + */ +always_inline u32 +tcp_tstamp (tcp_connection_t * tc) +{ + return (tcp_main.wrk_ctx[tc->c_thread_index].time_now - + tc->timestamp_delta); +} + +always_inline f64 +tcp_time_now_us (u32 thread_index) +{ + return transport_time_now (thread_index); +} + +always_inline u32 
+tcp_set_time_now (tcp_worker_ctx_t * wrk) +{ + wrk->time_now = clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; + return wrk->time_now; +} + +always_inline tcp_connection_t * +tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error, + u8 is_ip4, u8 is_nolookup) +{ + u32 fib_index = vnet_buffer (b)->ip.fib_index; + int n_advance_bytes, n_data_bytes; + transport_connection_t *tc; + tcp_header_t *tcp; + u8 result = 0; + + if (is_ip4) + { + ip4_header_t *ip4 = vlib_buffer_get_current (b); + int ip_hdr_bytes = ip4_header_bytes (ip4); + if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp))) + { + *error = TCP_ERROR_LENGTH; + return 0; + } + tcp = ip4_next_header (ip4); + vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4; + n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp)); + n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes; + + /* Length check. Checksum computed by ipx_local no need to compute again */ + if (PREDICT_FALSE (n_data_bytes < 0)) + { + *error = TCP_ERROR_LENGTH; + return 0; + } + + if (!is_nolookup) + tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address, + &ip4->src_address, tcp->dst_port, + tcp->src_port, + TRANSPORT_PROTO_TCP, thread_index, + &result); + } + else + { + ip6_header_t *ip6 = vlib_buffer_get_current (b); + if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp))) + { + *error = TCP_ERROR_LENGTH; + return 0; + } + tcp = ip6_next_header (ip6); + vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6; + n_advance_bytes = tcp_header_bytes (tcp); + n_data_bytes = clib_net_to_host_u16 (ip6->payload_length) + - n_advance_bytes; + n_advance_bytes += sizeof (ip6[0]); + + if (PREDICT_FALSE (n_data_bytes < 0)) + { + *error = TCP_ERROR_LENGTH; + return 0; + } + + if (!is_nolookup) + { + if (PREDICT_FALSE + (ip6_address_is_link_local_unicast (&ip6->dst_address))) + { + ip4_main_t *im = &ip4_main; + fib_index = vec_elt (im->fib_index_by_sw_if_index, + 
vnet_buffer (b)->sw_if_index[VLIB_RX]); + } + + tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address, + &ip6->src_address, + tcp->dst_port, tcp->src_port, + TRANSPORT_PROTO_TCP, + thread_index, &result); + } + } + + if (is_nolookup) + tc = + (transport_connection_t *) tcp_connection_get (vnet_buffer (b)-> + tcp.connection_index, + thread_index); + + vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number); + vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number); + vnet_buffer (b)->tcp.data_offset = n_advance_bytes; + vnet_buffer (b)->tcp.data_len = n_data_bytes; + vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number + + n_data_bytes; + vnet_buffer (b)->tcp.flags = 0; + + *error = result ? TCP_ERROR_NONE + result : *error; + + return tcp_get_connection_from_transport (tc); +} + +/** + * Initialize connection by gleaning network and rcv params from buffer + * + * @param tc connection to initialize + * @param b buffer whose current data is pointing at ip + * @param is_ip4 flag set to 1 if using ip4 + */ +always_inline void +tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4) +{ + tcp_header_t *th = tcp_buffer_hdr (b); + + tc->c_lcl_port = th->dst_port; + tc->c_rmt_port = th->src_port; + tc->c_is_ip4 = is_ip4; + + if (is_ip4) + { + ip4_header_t *ip4 = vlib_buffer_get_current (b); + tc->c_lcl_ip4.as_u32 = ip4->dst_address.as_u32; + tc->c_rmt_ip4.as_u32 = ip4->src_address.as_u32; + } + else + { + ip6_header_t *ip6 = vlib_buffer_get_current (b); + clib_memcpy_fast (&tc->c_lcl_ip6, &ip6->dst_address, + sizeof (ip6_address_t)); + clib_memcpy_fast (&tc->c_rmt_ip6, &ip6->src_address, + sizeof (ip6_address_t)); + } + + tc->irs = vnet_buffer (b)->tcp.seq_number; + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_number + 1; + tc->rcv_las = tc->rcv_nxt; + tc->sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX]; + tc->snd_wl1 = vnet_buffer (b)->tcp.seq_number; + tc->snd_wl2 = vnet_buffer (b)->tcp.ack_number; + + 
/* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} + * segments are used to initialize PAWS. */ + if (tcp_opts_tstamp (&tc->rcv_opts)) + { + tc->tsval_recent = tc->rcv_opts.tsval; + tc->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&tc->rcv_opts)) + tc->snd_wscale = tc->rcv_opts.wscale; + + tc->snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; +} + +always_inline void +tcp_update_rto (tcp_connection_t * tc) +{ + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tc->rto = clib_max (tc->rto, TCP_RTO_MIN); +} + +always_inline u8 +tcp_is_descheduled (tcp_connection_t * tc) +{ + return (transport_connection_is_descheduled (&tc->connection) ? 1 : 0); +} + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number net order + * @param ack - ack number net order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, + u16 wnd) +{ + tcp_header_t *th; + + th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len); + + th->src_port = sp; + th->dst_port = dp; + th->seq_number = seq; + th->ack_number = ack; + th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4; + th->flags = flags; + th->window = wnd; + th->checksum = 0; + th->urgent_pointer = 0; + vnet_buffer (b)->l4_hdr_offset = (u8 *) th - b->data; + b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID; + return th; +} + +/** + * Push TCP header to buffer + * + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number host order + * @param ack 
- ack number host order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd) +{ + return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, + clib_host_to_net_u32 (seq), + clib_host_to_net_u32 (ack), + tcp_hdr_opts_len, flags, + clib_host_to_net_u16 (wnd)); +} + +#endif /* SRC_VNET_TCP_TCP_INLINES_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 86158b9d6c6..e27cffb9444 100755 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -16,8 +16,8 @@ #include <vppinfra/sparse_vec.h> #include <vnet/fib/ip4_fib.h> #include <vnet/fib/ip6_fib.h> -#include <vnet/tcp/tcp_packet.h> #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> #include <vnet/session/session.h> #include <math.h> @@ -116,119 +116,6 @@ tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) } /** - * Parse TCP header options. 
- * - * @param th TCP header - * @param to TCP options data structure to be populated - * @param is_syn set if packet is syn - * @return -1 if parsing failed - */ -static inline int -tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn) -{ - const u8 *data; - u8 opt_len, opts_len, kind; - int j; - sack_block_t b; - - opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); - data = (const u8 *) (th + 1); - - /* Zero out all flags but those set in SYN */ - to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE - | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS); - - for (; opts_len > 0; opts_len -= opt_len, data += opt_len) - { - kind = data[0]; - - /* Get options length */ - if (kind == TCP_OPTION_EOL) - break; - else if (kind == TCP_OPTION_NOOP) - { - opt_len = 1; - continue; - } - else - { - /* broken options */ - if (opts_len < 2) - return -1; - opt_len = data[1]; - - /* weird option length */ - if (opt_len < 2 || opt_len > opts_len) - return -1; - } - - /* Parse options */ - switch (kind) - { - case TCP_OPTION_MSS: - if (!is_syn) - break; - if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) - { - to->flags |= TCP_OPTS_FLAG_MSS; - to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); - } - break; - case TCP_OPTION_WINDOW_SCALE: - if (!is_syn) - break; - if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) - { - to->flags |= TCP_OPTS_FLAG_WSCALE; - to->wscale = data[2]; - if (to->wscale > TCP_MAX_WND_SCALE) - to->wscale = TCP_MAX_WND_SCALE; - } - break; - case TCP_OPTION_TIMESTAMP: - if (is_syn) - to->flags |= TCP_OPTS_FLAG_TSTAMP; - if ((to->flags & TCP_OPTS_FLAG_TSTAMP) - && opt_len == TCP_OPTION_LEN_TIMESTAMP) - { - to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); - to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); - } - break; - case TCP_OPTION_SACK_PERMITTED: - if (!is_syn) - break; - if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) - to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - break; - case 
TCP_OPTION_SACK_BLOCK: - /* If SACK permitted was not advertised or a SYN, break */ - if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) - break; - - /* If too short or not correctly formatted, break */ - if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) - break; - - to->flags |= TCP_OPTS_FLAG_SACK; - to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; - vec_reset_length (to->sacks); - for (j = 0; j < to->n_sack_blocks; j++) - { - b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j)); - b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j)); - vec_add1 (to->sacks, b); - } - break; - default: - /* Nothing to see here */ - continue; - } - } - return 0; -} - -/** * RFC1323: Check against wrapped sequence numbers (PAWS). If we have * timestamp to echo and it's less than tsval_recent, drop segment * but still send an ACK in order to retain TCP's mechanism for detecting @@ -565,15 +452,6 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) } } -#ifndef CLIB_MARCH_VARIANT -void -tcp_update_rto (tcp_connection_t * tc) -{ - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - tc->rto = clib_max (tc->rto, TCP_RTO_MIN); -} -#endif /* CLIB_MARCH_VARIANT */ - /** * Update RTT estimate and RTO timer * @@ -726,567 +604,6 @@ tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) tc->burst_acked += tc->bytes_acked; } -#ifndef CLIB_MARCH_VARIANT -static u32 -scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) -{ - ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes)); - return hole - sb->holes; -} - -static u32 -scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) -{ - return hole->end - hole->start; -} - -sack_scoreboard_hole_t * -scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) -{ - if (index != TCP_INVALID_SACK_HOLE_INDEX) - return pool_elt_at_index (sb->holes, index); - return 0; -} - -sack_scoreboard_hole_t * -scoreboard_next_hole (sack_scoreboard_t * sb, 
sack_scoreboard_hole_t * hole) -{ - if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) - return pool_elt_at_index (sb->holes, hole->next); - return 0; -} - -sack_scoreboard_hole_t * -scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) -{ - if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) - return pool_elt_at_index (sb->holes, hole->prev); - return 0; -} - -sack_scoreboard_hole_t * -scoreboard_first_hole (sack_scoreboard_t * sb) -{ - if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) - return pool_elt_at_index (sb->holes, sb->head); - return 0; -} - -sack_scoreboard_hole_t * -scoreboard_last_hole (sack_scoreboard_t * sb) -{ - if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) - return pool_elt_at_index (sb->holes, sb->tail); - return 0; -} - -static void -scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) -{ - sack_scoreboard_hole_t *next, *prev; - - if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) - { - next = pool_elt_at_index (sb->holes, hole->next); - next->prev = hole->prev; - } - else - { - sb->tail = hole->prev; - } - - if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) - { - prev = pool_elt_at_index (sb->holes, hole->prev); - prev->next = hole->next; - } - else - { - sb->head = hole->next; - } - - if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; - - /* Poison the entry */ - if (CLIB_DEBUG > 0) - clib_memset (hole, 0xfe, sizeof (*hole)); - - pool_put (sb->holes, hole); -} - -static sack_scoreboard_hole_t * -scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, - u32 start, u32 end) -{ - sack_scoreboard_hole_t *hole, *next, *prev; - u32 hole_index; - - pool_get (sb->holes, hole); - clib_memset (hole, 0, sizeof (*hole)); - - hole->start = start; - hole->end = end; - hole_index = scoreboard_hole_index (sb, hole); - - prev = scoreboard_get_hole (sb, prev_index); - if (prev) - { - hole->prev = prev_index; - hole->next = prev->next; - - if ((next = scoreboard_next_hole 
(sb, hole))) - next->prev = hole_index; - else - sb->tail = hole_index; - - prev->next = hole_index; - } - else - { - sb->head = hole_index; - hole->prev = TCP_INVALID_SACK_HOLE_INDEX; - hole->next = TCP_INVALID_SACK_HOLE_INDEX; - } - - return hole; -} - -always_inline void -scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end, - u8 has_rxt) -{ - if (!has_rxt || seq_geq (start, sb->high_rxt)) - return; - - sb->rxt_sacked += - seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start); -} - -always_inline void -scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss) -{ - sack_scoreboard_hole_t *left, *right; - u32 sacked = 0, blks = 0, old_sacked; - - old_sacked = sb->sacked_bytes; - - sb->last_lost_bytes = 0; - sb->lost_bytes = 0; - sb->sacked_bytes = 0; - - right = scoreboard_last_hole (sb); - if (!right) - { - sb->sacked_bytes = sb->high_sacked - ack; - sb->last_sacked_bytes = sb->sacked_bytes - - (old_sacked - sb->last_bytes_delivered); - return; - } - - if (seq_gt (sb->high_sacked, right->end)) - { - sacked = sb->high_sacked - right->end; - blks = 1; - } - - while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss - && blks < TCP_DUPACK_THRESHOLD) - { - if (right->is_lost) - sb->lost_bytes += scoreboard_hole_bytes (right); - - left = scoreboard_prev_hole (sb, right); - if (!left) - { - ASSERT (right->start == ack || sb->is_reneging); - sacked += right->start - ack; - right = 0; - break; - } - - sacked += right->start - left->end; - blks++; - right = left; - } - - /* right is first lost */ - while (right) - { - sb->lost_bytes += scoreboard_hole_bytes (right); - sb->last_lost_bytes += right->is_lost ? 
0 : (right->end - right->start); - right->is_lost = 1; - left = scoreboard_prev_hole (sb, right); - if (!left) - { - ASSERT (right->start == ack || sb->is_reneging); - sacked += right->start - ack; - break; - } - sacked += right->start - left->end; - right = left; - } - - sb->sacked_bytes = sacked; - sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered); -} - -/** - * Figure out the next hole to retransmit - * - * Follows logic proposed in RFC6675 Sec. 4, NextSeg() - */ -sack_scoreboard_hole_t * -scoreboard_next_rxt_hole (sack_scoreboard_t * sb, - sack_scoreboard_hole_t * start, - u8 have_unsent, u8 * can_rescue, u8 * snd_limited) -{ - sack_scoreboard_hole_t *hole = 0; - - hole = start ? start : scoreboard_first_hole (sb); - while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) - hole = scoreboard_next_hole (sb, hole); - - /* Nothing, return */ - if (!hole) - { - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; - return 0; - } - - /* Rule (1): if higher than rxt, less than high_sacked and lost */ - if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) - { - sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); - } - else - { - /* Rule (2): available unsent data */ - if (have_unsent) - { - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; - return 0; - } - /* Rule (3): if hole not lost */ - else if (seq_lt (hole->start, sb->high_sacked)) - { - /* And we didn't already retransmit it */ - if (seq_leq (hole->end, sb->high_rxt)) - { - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; - return 0; - } - *snd_limited = 0; - sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); - } - /* Rule (4): if hole beyond high_sacked */ - else - { - ASSERT (seq_geq (hole->start, sb->high_sacked)); - *snd_limited = 1; - *can_rescue = 1; - /* HighRxt MUST NOT be updated */ - return 0; - } - } - - if (hole && seq_lt (sb->high_rxt, hole->start)) - sb->high_rxt = hole->start; - - return hole; -} - -void -scoreboard_init_rxt (sack_scoreboard_t * sb, u32 
snd_una) -{ - sack_scoreboard_hole_t *hole; - hole = scoreboard_first_hole (sb); - if (hole) - { - snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start; - sb->cur_rxt_hole = sb->head; - } - sb->high_rxt = snd_una; - sb->rescue_rxt = snd_una - 1; -} - -void -scoreboard_init (sack_scoreboard_t * sb) -{ - sb->head = TCP_INVALID_SACK_HOLE_INDEX; - sb->tail = TCP_INVALID_SACK_HOLE_INDEX; - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; -} - -void -scoreboard_clear (sack_scoreboard_t * sb) -{ - sack_scoreboard_hole_t *hole; - while ((hole = scoreboard_first_hole (sb))) - { - scoreboard_remove_hole (sb, hole); - } - ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); - ASSERT (pool_elts (sb->holes) == 0); - sb->sacked_bytes = 0; - sb->last_sacked_bytes = 0; - sb->last_bytes_delivered = 0; - sb->lost_bytes = 0; - sb->last_lost_bytes = 0; - sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; - sb->is_reneging = 0; -} - -void -scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end) -{ - sack_scoreboard_hole_t *last_hole; - - clib_warning ("sack reneging"); - - scoreboard_clear (sb); - last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, - start, end); - last_hole->is_lost = 1; - sb->tail = scoreboard_hole_index (sb, last_hole); - sb->high_sacked = start; - scoreboard_init_rxt (sb, start); -} - -#endif /* CLIB_MARCH_VARIANT */ - -/** - * Test that scoreboard is sane after recovery - * - * Returns 1 if scoreboard is empty or if first hole beyond - * snd_una. 
- */ -static u8 -tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) -{ - sack_scoreboard_hole_t *hole; - hole = scoreboard_first_hole (&tc->sack_sb); - return (!hole || (seq_geq (hole->start, tc->snd_una) - && seq_lt (hole->end, tc->snd_nxt))); -} - -#ifndef CLIB_MARCH_VARIANT - -void -tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) -{ - sack_scoreboard_hole_t *hole, *next_hole; - sack_scoreboard_t *sb = &tc->sack_sb; - sack_block_t *blk, *rcv_sacks; - u32 blk_index = 0, i, j; - u8 has_rxt; - - sb->last_sacked_bytes = 0; - sb->last_bytes_delivered = 0; - sb->rxt_sacked = 0; - - if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes - && sb->head == TCP_INVALID_SACK_HOLE_INDEX) - return; - - has_rxt = tcp_in_cong_recovery (tc); - - /* Remove invalid blocks */ - blk = tc->rcv_opts.sacks; - while (blk < vec_end (tc->rcv_opts.sacks)) - { - if (seq_lt (blk->start, blk->end) - && seq_gt (blk->start, tc->snd_una) - && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt)) - { - blk++; - continue; - } - vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); - } - - /* Add block for cumulative ack */ - if (seq_gt (ack, tc->snd_una)) - { - vec_add2 (tc->rcv_opts.sacks, blk, 1); - blk->start = tc->snd_una; - blk->end = ack; - } - - if (vec_len (tc->rcv_opts.sacks) == 0) - return; - - tcp_scoreboard_trace_add (tc, ack); - - /* Make sure blocks are ordered */ - rcv_sacks = tc->rcv_opts.sacks; - for (i = 0; i < vec_len (rcv_sacks); i++) - for (j = i + 1; j < vec_len (rcv_sacks); j++) - if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start)) - { - sack_block_t tmp = rcv_sacks[i]; - rcv_sacks[i] = rcv_sacks[j]; - rcv_sacks[j] = tmp; - } - - if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) - { - /* Handle reneging as a special case */ - if (PREDICT_FALSE (sb->is_reneging)) - { - /* No holes, only sacked bytes */ - if (seq_leq (tc->snd_nxt, sb->high_sacked)) - { - /* No progress made so return */ - if (seq_leq (ack, tc->snd_una)) - return; - - /* Update sacked bytes 
delivered and return */ - sb->last_bytes_delivered = ack - tc->snd_una; - sb->sacked_bytes -= sb->last_bytes_delivered; - sb->is_reneging = seq_lt (ack, sb->high_sacked); - return; - } - - /* New hole above high sacked. Add it and process normally */ - hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, - sb->high_sacked, tc->snd_nxt); - sb->tail = scoreboard_hole_index (sb, hole); - } - /* Not reneging and no holes. Insert the first that covers all - * outstanding bytes */ - else - { - hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, - tc->snd_una, tc->snd_nxt); - sb->tail = scoreboard_hole_index (sb, hole); - } - sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end; - } - else - { - /* If we have holes but snd_nxt is beyond the last hole, update - * last hole end or add new hole after high sacked */ - hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_nxt, hole->end)) - { - if (seq_geq (hole->start, sb->high_sacked)) - { - hole->end = tc->snd_nxt; - } - /* New hole after high sacked block */ - else if (seq_lt (sb->high_sacked, tc->snd_nxt)) - { - scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, - tc->snd_nxt); - } - } - - /* Keep track of max byte sacked for when the last hole - * is acked */ - sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end, - sb->high_sacked); - } - - /* Walk the holes with the SACK blocks */ - hole = pool_elt_at_index (sb->holes, sb->head); - - if (PREDICT_FALSE (sb->is_reneging)) - { - sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una, - ack - tc->snd_una); - sb->is_reneging = seq_lt (ack, hole->start); - } - - while (hole && blk_index < vec_len (rcv_sacks)) - { - blk = &rcv_sacks[blk_index]; - if (seq_leq (blk->start, hole->start)) - { - /* Block covers hole. Remove hole */ - if (seq_geq (blk->end, hole->end)) - { - next_hole = scoreboard_next_hole (sb, hole); - - /* If covered by ack, compute delivered bytes */ - if (blk->end == ack) - { - u32 sacked = next_hole ? 
next_hole->start : sb->high_sacked; - if (PREDICT_FALSE (seq_lt (ack, sacked))) - { - sb->last_bytes_delivered += ack - hole->end; - sb->is_reneging = 1; - } - else - { - sb->last_bytes_delivered += sacked - hole->end; - sb->is_reneging = 0; - } - } - scoreboard_update_sacked_rxt (sb, hole->start, hole->end, - has_rxt); - scoreboard_remove_hole (sb, hole); - hole = next_hole; - } - /* Partial 'head' overlap */ - else - { - if (seq_gt (blk->end, hole->start)) - { - scoreboard_update_sacked_rxt (sb, hole->start, blk->end, - has_rxt); - hole->start = blk->end; - } - blk_index++; - } - } - else - { - /* Hole must be split */ - if (seq_lt (blk->end, hole->end)) - { - u32 hole_index = scoreboard_hole_index (sb, hole); - next_hole = scoreboard_insert_hole (sb, hole_index, blk->end, - hole->end); - /* Pool might've moved */ - hole = scoreboard_get_hole (sb, hole_index); - hole->end = blk->start; - - scoreboard_update_sacked_rxt (sb, blk->start, blk->end, - has_rxt); - - blk_index++; - ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); - } - else if (seq_lt (blk->start, hole->end)) - { - scoreboard_update_sacked_rxt (sb, blk->start, hole->end, - has_rxt); - hole->end = blk->start; - } - hole = scoreboard_next_hole (sb, hole); - } - } - - scoreboard_update_bytes (sb, ack, tc->snd_mss); - - ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); - ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc) - || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack)); - ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt - - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc)); - ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) - || sb->is_reneging || sb->holes[sb->head].start == ack); - ASSERT (sb->last_lost_bytes <= sb->lost_bytes); - ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes - - sb->last_bytes_delivered >= sb->rxt_sacked); - ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered - || 
(tc->flags & TCP_CONN_FINSNT)); - - TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc); -} -#endif /* CLIB_MARCH_VARIANT */ - /** * Try to update snd_wnd based on feedback received from peer. * @@ -1825,89 +1142,6 @@ tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, *error = TCP_ERROR_FIN_RCVD; } -#ifndef CLIB_MARCH_VARIANT -static u8 -tcp_sack_vector_is_sane (sack_block_t * sacks) -{ - int i; - for (i = 1; i < vec_len (sacks); i++) - { - if (sacks[i - 1].end == sacks[i].start) - return 0; - } - return 1; -} - -/** - * Build SACK list as per RFC2018. - * - * Makes sure the first block contains the segment that generated the current - * ACK and the following ones are the ones most recently reported in SACK - * blocks. - * - * @param tc TCP connection for which the SACK list is updated - * @param start Start sequence number of the newest SACK block - * @param end End sequence of the newest SACK block - */ -void -tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) -{ - sack_block_t *new_list = tc->snd_sacks_fl, *block = 0; - int i; - - /* If the first segment is ooo add it to the list. Last write might've moved - * rcv_nxt over the first segment. */ - if (seq_lt (tc->rcv_nxt, start)) - { - vec_add2 (new_list, block, 1); - block->start = start; - block->end = end; - } - - /* Find the blocks still worth keeping. */ - for (i = 0; i < vec_len (tc->snd_sacks); i++) - { - /* Discard if rcv_nxt advanced beyond current block */ - if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt)) - continue; - - /* Merge or drop if segment overlapped by the new segment */ - if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start) - && seq_leq (tc->snd_sacks[i].start, new_list[0].end))) - { - if (seq_lt (tc->snd_sacks[i].start, new_list[0].start)) - new_list[0].start = tc->snd_sacks[i].start; - if (seq_lt (new_list[0].end, tc->snd_sacks[i].end)) - new_list[0].end = tc->snd_sacks[i].end; - continue; - } - - /* Save to new SACK list if we have space. 
*/ - if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS) - vec_add1 (new_list, tc->snd_sacks[i]); - } - - ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); - - /* Replace old vector with new one */ - vec_reset_length (tc->snd_sacks); - tc->snd_sacks_fl = tc->snd_sacks; - tc->snd_sacks = new_list; - - /* Segments should not 'touch' */ - ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks)); -} - -u32 -tcp_sack_list_bytes (tcp_connection_t * tc) -{ - u32 bytes = 0, i; - for (i = 0; i < vec_len (tc->snd_sacks); i++) - bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start; - return bytes; -} -#endif /* CLIB_MARCH_VARIANT */ - /** Enqueue data for delivery to application */ static int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, @@ -3551,102 +2785,6 @@ tcp_input_set_error_next (tcp_main_t * tm, u16 * next, u32 * error, u8 is_ip4) } } -always_inline tcp_connection_t * -tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error, - u8 is_ip4, u8 is_nolookup) -{ - u32 fib_index = vnet_buffer (b)->ip.fib_index; - int n_advance_bytes, n_data_bytes; - transport_connection_t *tc; - tcp_header_t *tcp; - u8 result = 0; - - if (is_ip4) - { - ip4_header_t *ip4 = vlib_buffer_get_current (b); - int ip_hdr_bytes = ip4_header_bytes (ip4); - if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp))) - { - *error = TCP_ERROR_LENGTH; - return 0; - } - tcp = ip4_next_header (ip4); - vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4; - n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp)); - n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes; - - /* Length check. 
Checksum computed by ipx_local no need to compute again */ - if (PREDICT_FALSE (n_data_bytes < 0)) - { - *error = TCP_ERROR_LENGTH; - return 0; - } - - if (!is_nolookup) - tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address, - &ip4->src_address, tcp->dst_port, - tcp->src_port, - TRANSPORT_PROTO_TCP, thread_index, - &result); - } - else - { - ip6_header_t *ip6 = vlib_buffer_get_current (b); - if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp))) - { - *error = TCP_ERROR_LENGTH; - return 0; - } - tcp = ip6_next_header (ip6); - vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6; - n_advance_bytes = tcp_header_bytes (tcp); - n_data_bytes = clib_net_to_host_u16 (ip6->payload_length) - - n_advance_bytes; - n_advance_bytes += sizeof (ip6[0]); - - if (PREDICT_FALSE (n_data_bytes < 0)) - { - *error = TCP_ERROR_LENGTH; - return 0; - } - - if (!is_nolookup) - { - if (PREDICT_FALSE - (ip6_address_is_link_local_unicast (&ip6->dst_address))) - { - ip4_main_t *im = &ip4_main; - fib_index = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (b)->sw_if_index[VLIB_RX]); - } - - tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address, - &ip6->src_address, - tcp->dst_port, tcp->src_port, - TRANSPORT_PROTO_TCP, - thread_index, &result); - } - } - - if (is_nolookup) - tc = - (transport_connection_t *) tcp_connection_get (vnet_buffer (b)-> - tcp.connection_index, - thread_index); - - vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number); - vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number); - vnet_buffer (b)->tcp.data_offset = n_advance_bytes; - vnet_buffer (b)->tcp.data_len = n_data_bytes; - vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number - + n_data_bytes; - vnet_buffer (b)->tcp.flags = 0; - - *error = result ? 
TCP_ERROR_NONE + result : *error; - - return tcp_get_connection_from_transport (tc); -} - static inline void tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, u16 * next, diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 69dd2247132..c5ffc2a4109 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -14,6 +14,7 @@ */ #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> typedef struct nwreno_cfg_ { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index d07fb2ec26e..aeeffa726c3 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -14,6 +14,7 @@ */ #include <vnet/tcp/tcp.h> +#include <vnet/tcp/tcp_inlines.h> #include <math.h> typedef enum _tcp_output_next @@ -166,90 +167,6 @@ tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) return tc->rcv_wnd >> tc->rcv_wscale; } -/** - * Write TCP options to segment. - */ -static u32 -tcp_options_write (u8 * data, tcp_options_t * opts) -{ - u32 opts_len = 0; - u32 buf, seq_len = 4; - - if (tcp_opts_mss (opts)) - { - *data++ = TCP_OPTION_MSS; - *data++ = TCP_OPTION_LEN_MSS; - buf = clib_host_to_net_u16 (opts->mss); - clib_memcpy_fast (data, &buf, sizeof (opts->mss)); - data += sizeof (opts->mss); - opts_len += TCP_OPTION_LEN_MSS; - } - - if (tcp_opts_wscale (opts)) - { - *data++ = TCP_OPTION_WINDOW_SCALE; - *data++ = TCP_OPTION_LEN_WINDOW_SCALE; - *data++ = opts->wscale; - opts_len += TCP_OPTION_LEN_WINDOW_SCALE; - } - - if (tcp_opts_sack_permitted (opts)) - { - *data++ = TCP_OPTION_SACK_PERMITTED; - *data++ = TCP_OPTION_LEN_SACK_PERMITTED; - opts_len += TCP_OPTION_LEN_SACK_PERMITTED; - } - - if (tcp_opts_tstamp (opts)) - { - *data++ = TCP_OPTION_TIMESTAMP; - *data++ = TCP_OPTION_LEN_TIMESTAMP; - buf = clib_host_to_net_u32 (opts->tsval); - clib_memcpy_fast (data, &buf, sizeof (opts->tsval)); - data += sizeof (opts->tsval); - buf = clib_host_to_net_u32 (opts->tsecr); - clib_memcpy_fast 
(data, &buf, sizeof (opts->tsecr)); - data += sizeof (opts->tsecr); - opts_len += TCP_OPTION_LEN_TIMESTAMP; - } - - if (tcp_opts_sack (opts)) - { - int i; - - if (opts->n_sack_blocks != 0) - { - *data++ = TCP_OPTION_SACK_BLOCK; - *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; - for (i = 0; i < opts->n_sack_blocks; i++) - { - buf = clib_host_to_net_u32 (opts->sacks[i].start); - clib_memcpy_fast (data, &buf, seq_len); - data += seq_len; - buf = clib_host_to_net_u32 (opts->sacks[i].end); - clib_memcpy_fast (data, &buf, seq_len); - data += seq_len; - } - opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; - } - } - - /* Terminate TCP options */ - if (opts_len % 4) - { - *data++ = TCP_OPTION_EOL; - opts_len += TCP_OPTION_LEN_EOL; - } - - /* Pad with zeroes to a u32 boundary */ - while (opts_len % 4) - { - *data++ = TCP_OPTION_NOOP; - opts_len += TCP_OPTION_LEN_NOOP; - } - return opts_len; -} - static int tcp_make_syn_options (tcp_connection_t * tc, tcp_options_t * opts) { @@ -563,7 +480,7 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) /** * Convert buffer to FIN-ACK */ -void +static void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK); @@ -598,7 +515,7 @@ tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b) /** * Convert buffer to SYN-ACK */ -void +static void tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) { tcp_options_t _snd_opts, *snd_opts = &_snd_opts; diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index 1e637a83271..fcc55ff549e 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -172,6 +172,219 @@ typedef struct #define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ #define TCP_OPTS_ALIGN 4 #define TCP_OPTS_MAX_SACK_BLOCKS 3 + +/* Modulo arithmetic for TCP sequence numbers */ +#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) +#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) +#define 
seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) +#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) +#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) + +/* Modulo arithmetic for timestamps */ +#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) +#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) + +/** + * Parse TCP header options. + * + * @param th TCP header + * @param to TCP options data structure to be populated + * @param is_syn set if packet is syn + * @return -1 if parsing failed + */ +always_inline int +tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn) +{ + const u8 *data; + u8 opt_len, opts_len, kind; + int j; + sack_block_t b; + + opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); + data = (const u8 *) (th + 1); + + /* Zero out all flags but those set in SYN */ + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE + | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS); + + for (; opts_len > 0; opts_len -= opt_len, data += opt_len) + { + kind = data[0]; + + /* Get options length */ + if (kind == TCP_OPTION_EOL) + break; + else if (kind == TCP_OPTION_NOOP) + { + opt_len = 1; + continue; + } + else + { + /* broken options */ + if (opts_len < 2) + return -1; + opt_len = data[1]; + + /* weird option length */ + if (opt_len < 2 || opt_len > opts_len) + return -1; + } + + /* Parse options */ + switch (kind) + { + case TCP_OPTION_MSS: + if (!is_syn) + break; + if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_MSS; + to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); + } + break; + case TCP_OPTION_WINDOW_SCALE: + if (!is_syn) + break; + if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_WSCALE; + to->wscale = data[2]; + if (to->wscale > TCP_MAX_WND_SCALE) + to->wscale = TCP_MAX_WND_SCALE; + } + break; + case TCP_OPTION_TIMESTAMP: + if (is_syn) + to->flags |= TCP_OPTS_FLAG_TSTAMP; + if ((to->flags & TCP_OPTS_FLAG_TSTAMP) + && 
opt_len == TCP_OPTION_LEN_TIMESTAMP) + { + to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); + to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); + } + break; + case TCP_OPTION_SACK_PERMITTED: + if (!is_syn) + break; + if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) + to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + break; + case TCP_OPTION_SACK_BLOCK: + /* If SACK permitted was not advertised or a SYN, break */ + if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) + break; + + /* If too short or not correctly formatted, break */ + if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) + break; + + to->flags |= TCP_OPTS_FLAG_SACK; + to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; + vec_reset_length (to->sacks); + for (j = 0; j < to->n_sack_blocks; j++) + { + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j)); + vec_add1 (to->sacks, b); + } + break; + default: + /* Nothing to see here */ + continue; + } + } + return 0; +} + +/** + * Write TCP options to segment. 
+ * + * @param data buffer where to write the options + * @param opts options to write + * @return length of options written + */ +always_inline u32 +tcp_options_write (u8 * data, tcp_options_t * opts) +{ + u32 opts_len = 0; + u32 buf, seq_len = 4; + + if (tcp_opts_mss (opts)) + { + *data++ = TCP_OPTION_MSS; + *data++ = TCP_OPTION_LEN_MSS; + buf = clib_host_to_net_u16 (opts->mss); + clib_memcpy_fast (data, &buf, sizeof (opts->mss)); + data += sizeof (opts->mss); + opts_len += TCP_OPTION_LEN_MSS; + } + + if (tcp_opts_wscale (opts)) + { + *data++ = TCP_OPTION_WINDOW_SCALE; + *data++ = TCP_OPTION_LEN_WINDOW_SCALE; + *data++ = opts->wscale; + opts_len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_sack_permitted (opts)) + { + *data++ = TCP_OPTION_SACK_PERMITTED; + *data++ = TCP_OPTION_LEN_SACK_PERMITTED; + opts_len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + if (tcp_opts_tstamp (opts)) + { + *data++ = TCP_OPTION_TIMESTAMP; + *data++ = TCP_OPTION_LEN_TIMESTAMP; + buf = clib_host_to_net_u32 (opts->tsval); + clib_memcpy_fast (data, &buf, sizeof (opts->tsval)); + data += sizeof (opts->tsval); + buf = clib_host_to_net_u32 (opts->tsecr); + clib_memcpy_fast (data, &buf, sizeof (opts->tsecr)); + data += sizeof (opts->tsecr); + opts_len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack (opts)) + { + int i; + + if (opts->n_sack_blocks != 0) + { + *data++ = TCP_OPTION_SACK_BLOCK; + *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + for (i = 0; i < opts->n_sack_blocks; i++) + { + buf = clib_host_to_net_u32 (opts->sacks[i].start); + clib_memcpy_fast (data, &buf, seq_len); + data += seq_len; + buf = clib_host_to_net_u32 (opts->sacks[i].end); + clib_memcpy_fast (data, &buf, seq_len); + data += seq_len; + } + opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + } + } + + /* Terminate TCP options */ + if (opts_len % 4) + { + *data++ = TCP_OPTION_EOL; + opts_len += TCP_OPTION_LEN_EOL; + } + + /* Pad with zeroes to a u32 boundary */ + while (opts_len 
% 4) + { + *data++ = TCP_OPTION_NOOP; + opts_len += TCP_OPTION_LEN_NOOP; + } + return opts_len; +} + #endif /* included_tcp_packet_h */ /* diff --git a/src/vnet/tcp/tcp_sack.c b/src/vnet/tcp/tcp_sack.c new file mode 100644 index 00000000000..3388dd6c5b3 --- /dev/null +++ b/src/vnet/tcp/tcp_sack.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/tcp/tcp_sack.h> + +static void +scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + sack_scoreboard_hole_t *next, *prev; + + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + { + next = pool_elt_at_index (sb->holes, hole->next); + next->prev = hole->prev; + } + else + { + sb->tail = hole->prev; + } + + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + { + prev = pool_elt_at_index (sb->holes, hole->prev); + prev->next = hole->next; + } + else + { + sb->head = hole->next; + } + + if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + clib_memset (hole, 0xfe, sizeof (*hole)); + + pool_put (sb->holes, hole); +} + +static sack_scoreboard_hole_t * +scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, + u32 start, u32 end) +{ + sack_scoreboard_hole_t *hole, *next, *prev; + u32 hole_index; + + pool_get (sb->holes, hole); + clib_memset (hole, 0, sizeof (*hole)); + + hole->start = start; + 
hole->end = end; + hole_index = scoreboard_hole_index (sb, hole); + + prev = scoreboard_get_hole (sb, prev_index); + if (prev) + { + hole->prev = prev_index; + hole->next = prev->next; + + if ((next = scoreboard_next_hole (sb, hole))) + next->prev = hole_index; + else + sb->tail = hole_index; + + prev->next = hole_index; + } + else + { + sb->head = hole_index; + hole->prev = TCP_INVALID_SACK_HOLE_INDEX; + hole->next = TCP_INVALID_SACK_HOLE_INDEX; + } + + return hole; +} + +always_inline void +scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end, + u8 has_rxt) +{ + if (!has_rxt || seq_geq (start, sb->high_rxt)) + return; + + sb->rxt_sacked += + seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start); +} + +always_inline void +scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss) +{ + sack_scoreboard_hole_t *left, *right; + u32 sacked = 0, blks = 0, old_sacked; + + old_sacked = sb->sacked_bytes; + + sb->last_lost_bytes = 0; + sb->lost_bytes = 0; + sb->sacked_bytes = 0; + + right = scoreboard_last_hole (sb); + if (!right) + { + sb->sacked_bytes = sb->high_sacked - ack; + sb->last_sacked_bytes = sb->sacked_bytes + - (old_sacked - sb->last_bytes_delivered); + return; + } + + if (seq_gt (sb->high_sacked, right->end)) + { + sacked = sb->high_sacked - right->end; + blks = 1; + } + + while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss + && blks < TCP_DUPACK_THRESHOLD) + { + if (right->is_lost) + sb->lost_bytes += scoreboard_hole_bytes (right); + + left = scoreboard_prev_hole (sb, right); + if (!left) + { + ASSERT (right->start == ack || sb->is_reneging); + sacked += right->start - ack; + right = 0; + break; + } + + sacked += right->start - left->end; + blks++; + right = left; + } + + /* right is first lost */ + while (right) + { + sb->lost_bytes += scoreboard_hole_bytes (right); + sb->last_lost_bytes += right->is_lost ? 
0 : (right->end - right->start); + right->is_lost = 1; + left = scoreboard_prev_hole (sb, right); + if (!left) + { + ASSERT (right->start == ack || sb->is_reneging); + sacked += right->start - ack; + break; + } + sacked += right->start - left->end; + right = left; + } + + sb->sacked_bytes = sacked; + sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered); +} + +/** + * Figure out the next hole to retransmit + * + * Follows logic proposed in RFC6675 Sec. 4, NextSeg() + */ +sack_scoreboard_hole_t * +scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * start, + u8 have_unsent, u8 * can_rescue, u8 * snd_limited) +{ + sack_scoreboard_hole_t *hole = 0; + + hole = start ? start : scoreboard_first_hole (sb); + while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) + hole = scoreboard_next_hole (sb, hole); + + /* Nothing, return */ + if (!hole) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + + /* Rule (1): if higher than rxt, less than high_sacked and lost */ + if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) + { + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + else + { + /* Rule (2): available unsent data */ + if (have_unsent) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + /* Rule (3): if hole not lost */ + else if (seq_lt (hole->start, sb->high_sacked)) + { + /* And we didn't already retransmit it */ + if (seq_leq (hole->end, sb->high_rxt)) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + *snd_limited = 0; + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + /* Rule (4): if hole beyond high_sacked */ + else + { + ASSERT (seq_geq (hole->start, sb->high_sacked)); + *snd_limited = 1; + *can_rescue = 1; + /* HighRxt MUST NOT be updated */ + return 0; + } + } + + if (hole && seq_lt (sb->high_rxt, hole->start)) + sb->high_rxt = hole->start; + + return hole; +} + +void +scoreboard_init_rxt (sack_scoreboard_t * sb, u32 
snd_una) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (sb); + if (hole) + { + snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start; + sb->cur_rxt_hole = sb->head; + } + sb->high_rxt = snd_una; + sb->rescue_rxt = snd_una - 1; +} + +void +scoreboard_init (sack_scoreboard_t * sb) +{ + sb->head = TCP_INVALID_SACK_HOLE_INDEX; + sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; +} + +void +scoreboard_clear (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + while ((hole = scoreboard_first_hole (sb))) + { + scoreboard_remove_hole (sb, hole); + } + ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); + ASSERT (pool_elts (sb->holes) == 0); + sb->sacked_bytes = 0; + sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; + sb->lost_bytes = 0; + sb->last_lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + sb->is_reneging = 0; +} + +void +scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end) +{ + sack_scoreboard_hole_t *last_hole; + + clib_warning ("sack reneging"); + + scoreboard_clear (sb); + last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + start, end); + last_hole->is_lost = 1; + sb->tail = scoreboard_hole_index (sb, last_hole); + sb->high_sacked = start; + scoreboard_init_rxt (sb, start); +} + +/** + * Test that scoreboard is sane after recovery + * + * Returns 1 if scoreboard is empty or if first hole beyond + * snd_una. 
+ */ +u8 +tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (&tc->sack_sb); + return (!hole || (seq_geq (hole->start, tc->snd_una) + && seq_lt (hole->end, tc->snd_nxt))); +} + +void +tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) +{ + sack_scoreboard_hole_t *hole, *next_hole; + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *blk, *rcv_sacks; + u32 blk_index = 0, i, j; + u8 has_rxt; + + sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; + sb->rxt_sacked = 0; + + if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes + && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + return; + + has_rxt = tcp_in_cong_recovery (tc); + + /* Remove invalid blocks */ + blk = tc->rcv_opts.sacks; + while (blk < vec_end (tc->rcv_opts.sacks)) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt)) + { + blk++; + continue; + } + vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); + } + + /* Add block for cumulative ack */ + if (seq_gt (ack, tc->snd_una)) + { + vec_add2 (tc->rcv_opts.sacks, blk, 1); + blk->start = tc->snd_una; + blk->end = ack; + } + + if (vec_len (tc->rcv_opts.sacks) == 0) + return; + + tcp_scoreboard_trace_add (tc, ack); + + /* Make sure blocks are ordered */ + rcv_sacks = tc->rcv_opts.sacks; + for (i = 0; i < vec_len (rcv_sacks); i++) + for (j = i + 1; j < vec_len (rcv_sacks); j++) + if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start)) + { + sack_block_t tmp = rcv_sacks[i]; + rcv_sacks[i] = rcv_sacks[j]; + rcv_sacks[j] = tmp; + } + + if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) + { + /* Handle reneging as a special case */ + if (PREDICT_FALSE (sb->is_reneging)) + { + /* No holes, only sacked bytes */ + if (seq_leq (tc->snd_nxt, sb->high_sacked)) + { + /* No progress made so return */ + if (seq_leq (ack, tc->snd_una)) + return; + + /* Update sacked bytes delivered and return */ + 
sb->last_bytes_delivered = ack - tc->snd_una; + sb->sacked_bytes -= sb->last_bytes_delivered; + sb->is_reneging = seq_lt (ack, sb->high_sacked); + return; + } + + /* New hole above high sacked. Add it and process normally */ + hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + sb->high_sacked, tc->snd_nxt); + sb->tail = scoreboard_hole_index (sb, hole); + } + /* Not reneging and no holes. Insert the first that covers all + * outstanding bytes */ + else + { + hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + tc->snd_una, tc->snd_nxt); + sb->tail = scoreboard_hole_index (sb, hole); + } + sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end; + } + else + { + /* If we have holes but snd_nxt is beyond the last hole, update + * last hole end or add new hole after high sacked */ + hole = scoreboard_last_hole (sb); + if (seq_gt (tc->snd_nxt, hole->end)) + { + if (seq_geq (hole->start, sb->high_sacked)) + { + hole->end = tc->snd_nxt; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_nxt)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_nxt); + } + } + + /* Keep track of max byte sacked for when the last hole + * is acked */ + sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end, + sb->high_sacked); + } + + /* Walk the holes with the SACK blocks */ + hole = pool_elt_at_index (sb->holes, sb->head); + + if (PREDICT_FALSE (sb->is_reneging)) + { + sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una, + ack - tc->snd_una); + sb->is_reneging = seq_lt (ack, hole->start); + } + + while (hole && blk_index < vec_len (rcv_sacks)) + { + blk = &rcv_sacks[blk_index]; + if (seq_leq (blk->start, hole->start)) + { + /* Block covers hole. Remove hole */ + if (seq_geq (blk->end, hole->end)) + { + next_hole = scoreboard_next_hole (sb, hole); + + /* If covered by ack, compute delivered bytes */ + if (blk->end == ack) + { + u32 sacked = next_hole ? 
next_hole->start : sb->high_sacked; + if (PREDICT_FALSE (seq_lt (ack, sacked))) + { + sb->last_bytes_delivered += ack - hole->end; + sb->is_reneging = 1; + } + else + { + sb->last_bytes_delivered += sacked - hole->end; + sb->is_reneging = 0; + } + } + scoreboard_update_sacked_rxt (sb, hole->start, hole->end, + has_rxt); + scoreboard_remove_hole (sb, hole); + hole = next_hole; + } + /* Partial 'head' overlap */ + else + { + if (seq_gt (blk->end, hole->start)) + { + scoreboard_update_sacked_rxt (sb, hole->start, blk->end, + has_rxt); + hole->start = blk->end; + } + blk_index++; + } + } + else + { + /* Hole must be split */ + if (seq_lt (blk->end, hole->end)) + { + u32 hole_index = scoreboard_hole_index (sb, hole); + next_hole = scoreboard_insert_hole (sb, hole_index, blk->end, + hole->end); + /* Pool might've moved */ + hole = scoreboard_get_hole (sb, hole_index); + hole->end = blk->start; + + scoreboard_update_sacked_rxt (sb, blk->start, blk->end, + has_rxt); + + blk_index++; + ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); + } + else if (seq_lt (blk->start, hole->end)) + { + scoreboard_update_sacked_rxt (sb, blk->start, hole->end, + has_rxt); + hole->end = blk->start; + } + hole = scoreboard_next_hole (sb, hole); + } + } + + scoreboard_update_bytes (sb, ack, tc->snd_mss); + + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); + ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc) + || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack)); + ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt + - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->is_reneging || sb->holes[sb->head].start == ack); + ASSERT (sb->last_lost_bytes <= sb->lost_bytes); + ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes + - sb->last_bytes_delivered >= sb->rxt_sacked); + ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered + || 
(tc->flags & TCP_CONN_FINSNT)); + + TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc); +} + +static u8 +tcp_sack_vector_is_sane (sack_block_t * sacks) +{ + int i; + for (i = 1; i < vec_len (sacks); i++) + { + if (sacks[i - 1].end == sacks[i].start) + return 0; + } + return 1; +} + +/** + * Build SACK list as per RFC2018. + * + * Makes sure the first block contains the segment that generated the current + * ACK and the following ones are the ones most recently reported in SACK + * blocks. + * + * @param tc TCP connection for which the SACK list is updated + * @param start Start sequence number of the newest SACK block + * @param end End sequence of the newest SACK block + */ +void +tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) +{ + sack_block_t *new_list = tc->snd_sacks_fl, *block = 0; + int i; + + /* If the first segment is ooo add it to the list. Last write might've moved + * rcv_nxt over the first segment. */ + if (seq_lt (tc->rcv_nxt, start)) + { + vec_add2 (new_list, block, 1); + block->start = start; + block->end = end; + } + + /* Find the blocks still worth keeping. */ + for (i = 0; i < vec_len (tc->snd_sacks); i++) + { + /* Discard if rcv_nxt advanced beyond current block */ + if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt)) + continue; + + /* Merge or drop if segment overlapped by the new segment */ + if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start) + && seq_leq (tc->snd_sacks[i].start, new_list[0].end))) + { + if (seq_lt (tc->snd_sacks[i].start, new_list[0].start)) + new_list[0].start = tc->snd_sacks[i].start; + if (seq_lt (new_list[0].end, tc->snd_sacks[i].end)) + new_list[0].end = tc->snd_sacks[i].end; + continue; + } + + /* Save to new SACK list if we have space. 
*/ + if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS) + vec_add1 (new_list, tc->snd_sacks[i]); + } + + ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); + + /* Replace old vector with new one */ + vec_reset_length (tc->snd_sacks); + tc->snd_sacks_fl = tc->snd_sacks; + tc->snd_sacks = new_list; + + /* Segments should not 'touch' */ + ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks)); +} + +u32 +tcp_sack_list_bytes (tcp_connection_t * tc) +{ + u32 bytes = 0, i; + for (i = 0; i < vec_len (tc->snd_sacks); i++) + bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start; + return bytes; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_sack.h b/src/vnet/tcp/tcp_sack.h new file mode 100644 index 00000000000..1c3fa95510b --- /dev/null +++ b/src/vnet/tcp/tcp_sack.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_VNET_TCP_TCP_SACK_H_ +#define SRC_VNET_TCP_TCP_SACK_H_ + +#include <vnet/tcp/tcp_types.h> + +always_inline u32 +scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes)); + return hole - sb->holes; +} + +always_inline u32 +scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) +{ + return hole->end - hole->start; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) +{ + if (index != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, index); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->next); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_first_hole (sack_scoreboard_t * sb) +{ + if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->head); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_last_hole (sack_scoreboard_t * sb) +{ + if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->tail); + return 0; +} + +#if TCP_SCOREBOARD_TRACE +#define tcp_scoreboard_trace_add(_tc, _ack) \ +{ \ + static u64 _group = 0; \ + sack_scoreboard_t *_sb = &_tc->sack_sb; \ + sack_block_t *_sack, *_sacks; \ + scoreboard_trace_elt_t *_elt; \ + int i; \ + _group++; \ + _sacks = _tc->rcv_opts.sacks; \ + for (i = 0; i < vec_len (_sacks); i++) \ + { \ + _sack = &_sacks[i]; \ + vec_add2 (_sb->trace, _elt, 1); \ + _elt->start = _sack->start; \ + _elt->end = _sack->end; \ + _elt->ack = _elt->end == _ack ? 
_ack : 0; \ + _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \ + _elt->group = _group; \ + } \ +} +#else +#define tcp_scoreboard_trace_add(_tc, _ack) +#endif + +sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * + start, u8 have_sent_1_smss, + u8 * can_rescue, + u8 * snd_limited); +void scoreboard_clear (sack_scoreboard_t * sb); +void scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end); +void scoreboard_init (sack_scoreboard_t * sb); +void scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una); + +format_function_t format_tcp_scoreboard; + +/* Made public for unit testing only */ +void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); +u32 tcp_sack_list_bytes (tcp_connection_t * tc); +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); +u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose); +u8 tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc); + +#endif /* SRC_VNET_TCP_TCP_SACK_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_types.h b/src/vnet/tcp/tcp_types.h new file mode 100644 index 00000000000..ccb7ae86aed --- /dev/null +++ b/src/vnet/tcp/tcp_types.h @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_VNET_TCP_TCP_TYPES_H_ +#define SRC_VNET_TCP_TCP_TYPES_H_ + +#include <vppinfra/clib.h> +#include <vppinfra/rbtree.h> +#include <vnet/tcp/tcp_packet.h> +#include <vnet/session/transport.h> +/* NOTE(review): THZ, TCP_PAWS_IDLE, TCP_FIB_RECHECK_PERIOD, TCP_TO_TIMER_TICK, TCP_RTO_MAX and TCP_RTO_MIN expand to unparenthesized expressions, so operator precedence at the use site applies to their individual terms. */ +#define TCP_TICK 0.001 /**< TCP tick period (s) */ +#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency (ticks per second) */ +#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ +#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days, expressed in TCP ticks */ +#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */ +#define TCP_MAX_OPTION_SPACE 40 +#define TCP_CC_DATA_SZ 24 /**< Size of cc_data[] in tcp_connection_t */ +#define TCP_MAX_GSO_SZ 65536 +#define TCP_RXT_MAX_BURST 10 + +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_IW_N_SEGMENTS 10 +#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ +#define TCP_USE_SACKS 1 /**< Disable only for testing */ + +/** TCP FSM state definitions as per RFC793. */ +#define foreach_tcp_fsm_state \ + _(CLOSED, "CLOSED") \ + _(LISTEN, "LISTEN") \ + _(SYN_SENT, "SYN_SENT") \ + _(SYN_RCVD, "SYN_RCVD") \ + _(ESTABLISHED, "ESTABLISHED") \ + _(CLOSE_WAIT, "CLOSE_WAIT") \ + _(FIN_WAIT_1, "FIN_WAIT_1") \ + _(LAST_ACK, "LAST_ACK") \ + _(CLOSING, "CLOSING") \ + _(FIN_WAIT_2, "FIN_WAIT_2") \ + _(TIME_WAIT, "TIME_WAIT") +/** One TCP_STATE_* enumerator per foreach_tcp_fsm_state entry, numbered from 0. */ +typedef enum _tcp_state +{ +#define _(sym, str) TCP_STATE_##sym, + foreach_tcp_fsm_state +#undef _ + TCP_N_STATES /**< Number of states */ +} tcp_state_t; + +/** TCP timers */ +#define foreach_tcp_timer \ + _(RETRANSMIT, "RETRANSMIT") \ + _(DELACK, "DELAYED ACK") \ + _(PERSIST, "PERSIST") \ + _(WAITCLOSE, "WAIT CLOSE") \ + _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ + +typedef enum _tcp_timers +{ +#define _(sym, str) TCP_TIMER_##sym, + foreach_tcp_timer +#undef _ + TCP_N_TIMERS /**< Number of timers */ +} tcp_timers_e; + +#define TCP_TIMER_HANDLE_INVALID ((u32) ~0) + +#define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */ +#define TCP_TO_TIMER_TICK TCP_TICK*10 /**< Factor for converting + ticks to timer ticks */ + +#define TCP_RTO_MAX 60 * THZ /* Max RTO: 60s, the smallest upper bound RFC6298 permits */ +#define TCP_RTO_MIN 0.2 * THZ /* Min RTO 
(200ms) - lower than standard */ +#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ +#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ +#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ +#define TCP_RTO_BOFF_MAX 8 /* Max number of retries before reset */ +#define TCP_ESTABLISH_TIME (60 * THZ) /* Connection establish timeout */ + +/** Connection configuration flags */ +#define foreach_tcp_cfg_flag \ + _(RATE_SAMPLE, "Rate sampling") \ + _(NO_CSUM_OFFLOAD, "No csum offload") \ + _(NO_TSO, "TSO off") \ + _(TSO, "TSO") \ + _(NO_ENDPOINT,"No endpoint") \ + +typedef enum tcp_cfg_flag_bits_ +{ +#define _(sym, str) TCP_CFG_F_##sym##_BIT, + foreach_tcp_cfg_flag +#undef _ + TCP_CFG_N_FLAG_BITS /**< Number of configuration flag bits */ +} tcp_cfg_flag_bits_e; + +typedef enum tcp_cfg_flag_ +{ +#define _(sym, str) TCP_CFG_F_##sym = 1 << TCP_CFG_F_##sym##_BIT, + foreach_tcp_cfg_flag +#undef _ + TCP_CFG_N_FLAGS /* NOTE(review): no initializer, so this is the last flag mask plus one, not a count of flags */ +} tcp_cfg_flags_e; + +/** TCP connection flags */ +#define foreach_tcp_connection_flag \ + _(SNDACK, "Send ACK") \ + _(FINSNT, "FIN sent") \ + _(RECOVERY, "Recovery") \ + _(FAST_RECOVERY, "Fast Recovery") \ + _(DCNT_PENDING, "Disconnect pending") \ + _(HALF_OPEN_DONE, "Half-open completed") \ + _(FINPNDG, "FIN pending") \ + _(RXT_PENDING, "Retransmit pending") \ + _(FRXT_FIRST, "Retransmit first") \ + _(DEQ_PENDING, "Dequeue pending ") \ + _(PSH_PENDING, "PSH pending") \ + _(FINRCVD, "FIN received") \ + _(ZERO_RWND_SENT, "Zero RWND sent") \ + +typedef enum tcp_connection_flag_bits_ +{ +#define _(sym, str) TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAG_BITS /**< Number of connection flag bits */ +} tcp_connection_flag_bits_e; + +typedef enum tcp_connection_flag_ +{ +#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAGS /* NOTE(review): no initializer, so this is the last flag mask plus one, not a count of flags */ +} tcp_connection_flags_e; + +#define TCP_SCOREBOARD_TRACE (0) /**< Non-zero compiles in the scoreboard trace vector */ +#define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */ +#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) + +typedef 
struct _scoreboard_trace_elt +{ + u32 start; /**< Start of the traced SACK block */ + u32 end; /**< End of the traced SACK block */ + u32 ack; /**< Ack number when it equals end, otherwise 0 */ + u32 snd_una_max; /**< snd_una_max when ack equals end, otherwise 0 */ + u32 group; /**< Number shared by elements traced together */ +} scoreboard_trace_elt_t; + +typedef struct _sack_scoreboard_hole +{ + u32 next; /**< Index for next entry in linked list */ + u32 prev; /**< Index for previous entry in linked list */ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ + u8 is_lost; /**< Mark hole as lost */ +} sack_scoreboard_hole_t; + +typedef struct _sack_scoreboard +{ + sack_scoreboard_hole_t *holes; /**< Pool of holes */ + u32 head; /**< Index of first entry */ + u32 tail; /**< Index of last entry */ + u32 sacked_bytes; /**< Number of bytes sacked in sb */ + u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 last_bytes_delivered; /**< Sack bytes delivered to app */ + u32 rxt_sacked; /**< Rxt bytes last delivered */ + u32 high_sacked; /**< Highest byte sacked (fack) */ + u32 high_rxt; /**< Highest retransmitted sequence */ + u32 rescue_rxt; /**< Rescue sequence number */ + u32 lost_bytes; /**< Bytes lost as per RFC6675 */ + u32 last_lost_bytes; /**< Number of bytes last lost */ + u32 cur_rxt_hole; /**< Retransmitting from this hole */ + u8 is_reneging; + +#if TCP_SCOREBOARD_TRACE + scoreboard_trace_elt_t *trace; /**< Vector of trace elements */ +#endif + +} sack_scoreboard_t; + +#define TCP_BTS_INVALID_INDEX ((u32)~0) + +typedef enum tcp_bts_flags_ +{ + TCP_BTS_IS_RXT = 1, + TCP_BTS_IS_APP_LIMITED = 1 << 1, + TCP_BTS_IS_SACKED = 1 << 2, + TCP_BTS_IS_RXT_LOST = 1 << 3, +} __clib_packed tcp_bts_flags_t; + +typedef struct tcp_bt_sample_ +{ + u32 next; /**< Next sample index in list */ + u32 prev; /**< Previous sample index in list */ + u32 min_seq; /**< Min seq number in sample */ + u32 max_seq; /**< Max seq number. 
Set for rxt samples */ + u64 delivered; /**< Total delivered bytes for sample */ + f64 delivered_time; /**< Delivered time when sample taken */ + f64 tx_time; /**< Transmit time for the burst */ + f64 first_tx_time; /**< Connection first tx time at tx */ + u64 tx_in_flight; /**< In flight at tx time */ + u64 tx_lost; /**< Lost at tx time */ + tcp_bts_flags_t flags; /**< Sample flag */ +} tcp_bt_sample_t; + +typedef struct tcp_rate_sample_ +{ + u64 prior_delivered; /**< Delivered of sample used for rate, i.e., + total bytes delivered at prior_time */ + f64 prior_time; /**< Delivered time of sample used for rate */ + f64 interval_time; /**< Time to ack the bytes delivered */ + f64 rtt_time; /**< RTT for sample */ + u64 tx_in_flight; /**< In flight at (re)transmit time */ + u64 tx_lost; /**< Lost over interval */ + u32 delivered; /**< Bytes delivered in interval_time */ + u32 acked_and_sacked; /**< Bytes acked + sacked now */ + u32 last_lost; /**< Bytes lost now */ + u32 lost; /**< Number of bytes lost over interval */ + tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */ +} tcp_rate_sample_t; + +typedef struct tcp_byte_tracker_ +{ + tcp_bt_sample_t *samples; /**< Pool of samples */ + rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */ + u32 head; /**< Head of samples linked list */ + u32 tail; /**< Tail of samples linked list */ + u32 last_ooo; /**< Cached last ooo sample */ +} tcp_byte_tracker_t; + +typedef enum _tcp_cc_algorithm_type +{ + TCP_CC_NEWRENO, + TCP_CC_CUBIC, + TCP_CC_LAST = TCP_CC_CUBIC +} tcp_cc_algorithm_type_e; +/* Forward declaration; struct _tcp_cc_algorithm is defined after tcp_connection_t. */ +typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t; + +typedef enum _tcp_cc_ack_t +{ + TCP_CC_ACK, + TCP_CC_DUPACK, + TCP_CC_PARTIALACK +} tcp_cc_ack_t; + +typedef enum tcp_cc_event_ +{ + TCP_CC_EVT_START_TX, +} tcp_cc_event_t; + +/* + * As per RFC4898 tcpEStatsStackSoftErrors + */ +typedef struct tcp_errors_ +{ + u32 below_data_wnd; /**< All data in seg is below snd_una */ + u32 above_data_wnd; /**< Some data 
in segment is above snd_wnd */ + u32 below_ack_wnd; /**< Acks for data below snd_una */ + u32 above_ack_wnd; /**< Acks for data not sent */ +} tcp_errors_t; + +typedef struct _tcp_connection +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + transport_connection_t connection; /**< Common transport data. First! */ + + u8 state; /**< TCP state as per tcp_state_t */ + u8 cfg_flags; /**< Connection configuration flags (see tcp_cfg_flags_e) */ + u16 flags; /**< Connection flags (see tcp_connection_flags_e) */ + u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */ + + u64 segs_in; /**< RFC4022/4898 tcpHCInSegs/tcpEStatsPerfSegsIn */ + u64 bytes_in; /**< RFC4898 tcpEStatsPerfHCDataOctetsIn */ + u64 segs_out; /**< RFC4898 tcpEStatsPerfSegsOut */ + u64 bytes_out; /**< RFC4898 tcpEStatsPerfHCDataOctetsOut */ + + /** Send sequence variables RFC793 */ + u32 snd_una; /**< oldest unacknowledged sequence number */ + u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/ + u32 snd_wnd; /**< send window */ + u32 snd_wl1; /**< seq number used for last snd.wnd update */ + u32 snd_wl2; /**< ack number used for last snd.wnd update */ + u32 snd_nxt; /**< next seq number to be sent */ + u16 snd_mss; /**< Effective send max seg (data) size */ + + u64 data_segs_in; /**< RFC4898 tcpEStatsPerfDataSegsIn */ + u64 data_segs_out; /**< RFC4898 tcpEStatsPerfDataSegsOut */ + + /** Receive sequence variables RFC793 */ + u32 rcv_nxt; /**< next sequence number expected */ + u32 rcv_wnd; /**< receive window we expect */ + + u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */ + u32 iss; /**< initial sent sequence */ + u32 irs; /**< initial remote sequence */ + + /* Options */ + u8 snd_opts_len; /**< Tx options len */ + u8 rcv_wscale; /**< Window scale to advertise to peer */ + u8 snd_wscale; /**< Window scale to use when sending */ + u32 tsval_recent; /**< Last timestamp received */ + u32 tsval_recent_age; /**< When tsval_recent was last updated */ + tcp_options_t snd_opts; /**< Tx options for connection */ + 
tcp_options_t rcv_opts; /**< Rx options for connection */ + + sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */ + u8 snd_sack_pos; /**< Position in vec of first block to send */ + sack_block_t *snd_sacks_fl; /**< Vector for building new list */ + sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ + + u16 rcv_dupacks; /**< Number of recent DUPACKs received */ + u32 dupacks_in; /**< RFC4898 tcpEStatsStackDupAcksIn */ + u8 pending_dupacks; /**< Number of DUPACKs to be sent */ + u32 dupacks_out; /**< RFC4898 tcpEStatsPathDupAcksOut */ + + /* Congestion control */ + u32 cwnd; /**< Congestion window */ + u32 cwnd_acc_bytes; /**< Bytes accumulated for cwnd increment */ + u32 ssthresh; /**< Slow-start threshold */ + u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 prev_cwnd; /**< cwnd before congestion */ + u32 bytes_acked; /**< Bytes acknowledged by current segment */ + u32 burst_acked; /**< Bytes acknowledged in current burst */ + u32 snd_rxt_bytes; /**< Retransmitted bytes during current cc event */ + u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ + u32 prr_delivered; /**< RFC6937 bytes delivered during current event */ + u32 prr_start; /**< snd_una when prr starts */ + u32 rxt_delivered; /**< Rxt bytes delivered during current cc event */ + u32 rxt_head; /**< snd_una last time we retransmitted the head */ + u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ + u32 snd_congestion; /**< snd_una_max when congestion is detected */ + u32 tx_fifo_size; /**< Tx fifo size. 
Used to constrain cwnd */ + tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ + u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */ + + u32 fr_occurences; /**< fast-retransmit occurrences RFC4898 + tcpEStatsStackFastRetran */ + u32 tr_occurences; /**< timer-retransmit occurrences */ + u64 bytes_retrans; /**< RFC4898 tcpEStatsPerfOctetsRetrans */ + u64 segs_retrans; /**< RFC4898 tcpEStatsPerfSegsRetrans */ + + /* RTT and RTO */ + u32 rto; /**< Retransmission timeout */ + u32 rto_boff; /**< Index for RTO backoff */ + u32 srtt; /**< Smoothed RTT */ + u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */ + u32 rtt_seq; /**< Sequence number for tracked ACK */ + f64 rtt_ts; /**< Timestamp for tracked ACK */ + f64 mrtt_us; /**< High precision mrtt from tracked acks */ + + u32 psh_seq; /**< Add psh header for seg that includes this */ + u32 next_node_index; /**< Can be used to control next node in output */ + u32 next_node_opaque; /**< Opaque to pass to next node */ + u32 limited_transmit; /**< snd_nxt when limited transmit starts */ + u32 sw_if_index; /**< Interface for the connection */ + + /* Delivery rate estimation */ + u64 delivered; /**< Total bytes delivered to peer */ + u64 app_limited; /**< Delivered when app-limited detected */ + f64 delivered_time; /**< Time last bytes were acked */ + f64 first_tx_time; /**< Send time for recently delivered/sent */ + u64 lost; /**< Total bytes lost */ + tcp_byte_tracker_t *bt; /**< Tx byte tracker */ + + tcp_errors_t errors; /**< Soft connection errors */ + + f64 start_ts; /**< Timestamp when connection initialized */ + u32 last_fib_check; /**< Last time we checked fib route for peer */ + u16 mss; /**< Our max seg size that includes options */ + u32 timestamp_delta; /**< Offset for timestamp */ + u32 ipv6_flow_label; /**< flow label for ipv6 header */ +/* rst_state is an alias: tc->rst_state expands to tc->snd_wl1, so both names use the same field. */ +#define rst_state snd_wl1 +} tcp_connection_t; +/** Callback table a congestion control algorithm provides, together with its name. */ +/* *INDENT-OFF* */ +struct _tcp_cc_algorithm +{ + const char *name; + 
uword (*unformat_cfg) (unformat_input_t * input); + void (*init) (tcp_connection_t * tc); + void (*cleanup) (tcp_connection_t * tc); + void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs); + void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack, + tcp_rate_sample_t *rs); + void (*congestion) (tcp_connection_t * tc); + void (*loss) (tcp_connection_t * tc); + void (*recovered) (tcp_connection_t * tc); + void (*undo_recovery) (tcp_connection_t * tc); + void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt); + u64 (*get_pacing_rate) (tcp_connection_t *tc); +}; +/* *INDENT-ON* */ +/* Connection flag helpers. NOTE(review): the _on/_off forms for fastrecovery, recovery and zero_rwnd_sent expand without enclosing parentheses, and tcp_in_slowstart does not parenthesize its tc argument. */ +#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY +#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY +#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY +#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) +#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) +#define tcp_disconnect_pending(tc) ((tc)->flags & TCP_CONN_DCNT_PENDING) +#define tcp_disconnect_pending_on(tc) ((tc)->flags |= TCP_CONN_DCNT_PENDING) +#define tcp_disconnect_pending_off(tc) ((tc)->flags &= ~TCP_CONN_DCNT_PENDING) +#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST) +#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST) +#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST) + +#define tcp_in_cong_recovery(tc) ((tc)->flags & \ + (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +/** Clear the FAST_RECOVERY and RECOVERY connection flags, then the FRXT_FIRST flag. */ +always_inline void +tcp_cong_recovery_off (tcp_connection_t * tc) +{ + tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); + tcp_fastrecovery_first_off (tc); +} + +#define tcp_csum_offload(tc) (!((tc)->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD)) + +#define tcp_zero_rwnd_sent(tc) ((tc)->flags & TCP_CONN_ZERO_RWND_SENT) +#define tcp_zero_rwnd_sent_on(tc) (tc)->flags 
|= TCP_CONN_ZERO_RWND_SENT +#define tcp_zero_rwnd_sent_off(tc) (tc)->flags &= ~TCP_CONN_ZERO_RWND_SENT +/** Reinterpret a transport connection pointer as a TCP connection pointer (plain cast). NOTE(review): assumes tconn is the connection member at the start of a tcp_connection_t - confirm against callers. */ +always_inline tcp_connection_t * +tcp_get_connection_from_transport (transport_connection_t * tconn) +{ + return (tcp_connection_t *) tconn; +} + +#endif /* SRC_VNET_TCP_TCP_TYPES_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |