From 68b0fb0c620c7451ef1a6380c43c39de6614db51 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 28 Feb 2017 15:15:56 -0500 Subject: VPP-598: tcp stack initial commit Change-Id: I49e5ce0aae6e4ff634024387ceaf7dbc432a0351 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_input.c | 2316 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2316 insertions(+) create mode 100644 src/vnet/tcp/tcp_input.c (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c new file mode 100644 index 00000000..daa0683b --- /dev/null +++ b/src/vnet/tcp/tcp_input.c @@ -0,0 +1,2316 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include +#undef tcp_error +}; + +/* All TCP nodes have the same outgoing arcs */ +#define foreach_tcp_state_next \ + _ (DROP, "error-drop") \ + _ (TCP4_OUTPUT, "tcp4-output") \ + _ (TCP6_OUTPUT, "tcp6-output") + +typedef enum _tcp_established_next +{ +#define _(s,n) TCP_ESTABLISHED_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_ESTABLISHED_N_NEXT, +} tcp_established_next_t; + +typedef enum _tcp_rcv_process_next +{ +#define _(s,n) TCP_RCV_PROCESS_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_RCV_PROCESS_N_NEXT, +} tcp_rcv_process_next_t; + +typedef enum _tcp_syn_sent_next +{ +#define _(s,n) TCP_SYN_SENT_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_SYN_SENT_N_NEXT, +} tcp_syn_sent_next_t; + +typedef enum _tcp_listen_next +{ +#define _(s,n) TCP_LISTEN_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_LISTEN_N_NEXT, +} tcp_listen_next_t; + +/* Generic, state independent indices */ +typedef enum _tcp_state_next +{ +#define _(s,n) TCP_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_STATE_N_NEXT, +} tcp_state_next_t; + +#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \ + : TCP_NEXT_TCP6_OUTPUT) + +vlib_node_registration_t tcp4_established_node; +vlib_node_registration_t tcp6_established_node; + +/** + * Validate segment sequence number. As per RFC793: + * + * Segment Receive Test + * Length Window + * ------- ------- ------------------------------------------- + * 0 0 SEG.SEQ = RCV.NXT + * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * >0 0 not acceptable + * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + * + * This ultimately consists in checking if segment falls within the window. 
+ * The one important difference compared to RFC793 is that we use rcv_las, + * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the + * peer's reference when computing our receive window. + * + * This accepts only segments within the window. + */ +always_inline u8 +tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) +{ + return seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) + && seq_geq (seq, tc->rcv_nxt); +} + +void +tcp_options_parse (tcp_header_t * th, tcp_options_t * to) +{ + const u8 *data; + u8 opt_len, opts_len, kind; + int j; + sack_block_t b; + + opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); + data = (const u8 *) (th + 1); + + /* Zero out all flags but those set in SYN */ + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + + for (; opts_len > 0; opts_len -= opt_len, data += opt_len) + { + kind = data[0]; + + /* Get options length */ + if (kind == TCP_OPTION_EOL) + break; + else if (kind == TCP_OPTION_NOOP) + opt_len = 1; + else + { + /* broken options */ + if (opts_len < 2) + break; + opt_len = data[1]; + + /* weird option length */ + if (opt_len < 2 || opt_len > opts_len) + break; + } + + /* Parse options */ + switch (kind) + { + case TCP_OPTION_MSS: + if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_MSS; + to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); + } + break; + case TCP_OPTION_WINDOW_SCALE: + if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_WSCALE; + to->wscale = data[2]; + if (to->wscale > TCP_MAX_WND_SCALE) + { + clib_warning ("Illegal window scaling value: %d", + to->wscale); + to->wscale = TCP_MAX_WND_SCALE; + } + } + break; + case TCP_OPTION_TIMESTAMP: + if (opt_len == TCP_OPTION_LEN_TIMESTAMP) + { + to->flags |= TCP_OPTS_FLAG_TSTAMP; + to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); + to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); + } + break; + case TCP_OPTION_SACK_PERMITTED: 
+ if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) + to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + break; + case TCP_OPTION_SACK_BLOCK: + /* If SACK permitted was not advertised or a SYN, break */ + if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) + break; + + /* If too short or not correctly formatted, break */ + if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) + break; + + to->flags |= TCP_OPTS_FLAG_SACK; + to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; + vec_reset_length (to->sacks); + for (j = 0; j < to->n_sack_blocks; j++) + { + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j)); + vec_add1 (to->sacks, b); + } + break; + default: + /* Nothing to see here */ + continue; + } + } +} + +always_inline int +tcp_segment_check_paws (tcp_connection_t * tc) +{ + /* XXX normally test for timestamp should be lt instead of leq, but for + * local testing this is not enough */ + return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + && timestamp_lt (tc->opt.tsval, tc->tsval_recent); +} + +/** + * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19 + * + * It first verifies if segment has a wrapped sequence number (PAWS) and then + * does the processing associated to the first four steps (ignoring security + * and precedence): sequence number, rst bit and syn bit checks. + * + * @return 0 if segments passes validation. + */ +static int +tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, + vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0) +{ + u8 paws_failed; + + if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) + return -1; + + tcp_options_parse (th0, &tc0->opt); + + /* RFC1323: Check against wrapped sequence numbers (PAWS). 
If we have + * timestamp to echo and it's less than tsval_recent, drop segment + * but still send an ACK in order to retain TCP's mechanism for detecting + * and recovering from half-open connections */ + paws_failed = tcp_segment_check_paws (tc0); + if (paws_failed) + { + clib_warning ("paws failed"); + + /* If it just so happens that a segment updates tsval_recent for a + * segment over 24 days old, invalidate tsval_recent. */ + if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, + tcp_time_now ())) + { + /* Age isn't reset until we get a valid tsval (bsd inspired) */ + tc0->tsval_recent = 0; + } + else + { + /* Drop after ack if not rst */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + return -1; + } + } + } + + /* 1st: check sequence number */ + if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end)) + { + if (!tcp_rst (th0)) + { + /* Send dup ack */ + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + } + return -1; + } + + /* 2nd: check the RST bit */ + if (tcp_rst (th0)) + { + /* Notify session that connection has been reset. Switch + * state to closed and await for session to do the cleanup. 
*/ + stream_session_reset_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSED; + return -1; + } + + /* 3rd: check security and precedence (skip) */ + + /* 4th: check the SYN bit */ + if (tcp_syn (th0)) + { + tcp_send_reset (b0, tc0->c_is_ip4); + return -1; + } + + /* If PAWS passed and segment in window, save timestamp */ + if (!paws_failed) + { + tc0->tsval_recent = tc0->opt.tsval; + tc0->tsval_recent_age = tcp_time_now (); + } + + return 0; +} + +always_inline int +tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0) +{ + /* SND.UNA =< SEG.ACK =< SND.NXT */ + return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number) + && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt)); +} + +/** + * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298 + * + * Note that although the original article, srtt and rttvar are scaled + * to minimize round-off errors, here we don't. Instead, we rely on + * better precision time measurements. + * + * TODO support us rtt resolution + */ +static void +tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) +{ + int err; + + if (tc->srtt != 0) + { + err = mrtt - tc->srtt; + tc->srtt += err >> 3; + + /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. + * The increase should be bound */ + tc->rttvar += (clib_abs (err) - tc->rttvar) >> 2; + } + else + { + /* First measurement. */ + tc->srtt = mrtt; + tc->rttvar = mrtt << 1; + } +} + +/** Update RTT estimate and RTO timer + * + * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK + * timing. Middle boxes are known to fiddle with TCP options so we + * should give higher priority to ACK timing. + * + * return 1 if valid rtt 0 otherwise + */ +static int +tcp_update_rtt (tcp_connection_t * tc, u32 ack) +{ + u32 mrtt = 0; + + /* Karn's rule, part 1. Don't use retransmitted segments to estimate + * RTT because they're ambiguous. 
*/ + if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + { + mrtt = tcp_time_now () - tc->rtt_ts; + tc->rtt_seq = 0; + } + + /* As per RFC7323 TSecr can be used for RTTM only if the segment advances + * snd_una, i.e., the left side of the send window: + * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't + * try to update rtt for dupacks */ + else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + { + mrtt = tcp_time_now () - tc->opt.tsecr; + } + + /* Ignore dubious measurements */ + if (mrtt == 0 || mrtt > TCP_RTT_MAX) + return 0; + + tcp_estimate_rtt (tc, mrtt); + + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + + return 1; +} + +/** + * Dequeue bytes that have been acked and while at it update RTT estimates. + */ +static void +tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) +{ + /* Dequeue the newly ACKed bytes */ + stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + + /* Update rtt and rto */ + if (tcp_update_rtt (tc, ack)) + { + /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ + tc->rto_boff = 0; + } +} + +/** Check if dupack as per RFC5681 Sec. 
2 */ +always_inline u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) +{ + return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + && seq_gt (tc->snd_una_max, tc->snd_una) + && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) + && (new_snd_wnd == tc->snd_wnd)); +} + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + sack_scoreboard_hole_t *next, *prev; + + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + { + next = pool_elt_at_index (sb->holes, hole->next); + next->prev = hole->prev; + } + + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + { + prev = pool_elt_at_index (sb->holes, hole->prev); + prev->next = hole->next; + } + else + { + sb->head = hole->next; + } + + pool_put (sb->holes, hole); +} + +sack_scoreboard_hole_t * +scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, + u32 start, u32 end) +{ + sack_scoreboard_hole_t *hole, *next; + u32 hole_index; + + pool_get (sb->holes, hole); + memset (hole, 0, sizeof (*hole)); + + hole->start = start; + hole->end = end; + hole_index = hole - sb->holes; + + if (prev) + { + hole->prev = prev - sb->holes; + hole->next = prev->next; + + if ((next = scoreboard_next_hole (sb, hole))) + next->prev = hole_index; + + prev->next = hole_index; + } + else + { + sb->head = hole_index; + hole->prev = TCP_INVALID_SACK_HOLE_INDEX; + hole->next = TCP_INVALID_SACK_HOLE_INDEX; + } + + return hole; +} + +static void +tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) +{ + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *blk, tmp; + sack_scoreboard_hole_t *hole, *next_hole; + u32 blk_index = 0; + int i, j; + + if (!tcp_opts_sack (tc) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + return; + + /* Remove invalid blocks */ + vec_foreach (blk, tc->opt.sacks) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_lt (blk->end, tc->snd_nxt)) + continue; + + vec_del1 
(tc->opt.sacks, blk - tc->opt.sacks); + } + + /* Add block for cumulative ack */ + if (seq_gt (ack, tc->snd_una)) + { + tmp.start = tc->snd_una; + tmp.end = ack; + vec_add1 (tc->opt.sacks, tmp); + } + + if (vec_len (tc->opt.sacks) == 0) + return; + + /* Make sure blocks are ordered */ + for (i = 0; i < vec_len (tc->opt.sacks); i++) + for (j = i; j < vec_len (tc->opt.sacks); j++) + if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + { + tmp = tc->opt.sacks[i]; + tc->opt.sacks[i] = tc->opt.sacks[j]; + tc->opt.sacks[j] = tmp; + } + + /* If no holes, insert the first that covers all outstanding bytes */ + if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) + { + scoreboard_insert_hole (sb, 0, tc->snd_una, tc->snd_una_max); + } + + /* Walk the holes with the SACK blocks */ + hole = pool_elt_at_index (sb->holes, sb->head); + while (hole && blk_index < vec_len (tc->opt.sacks)) + { + blk = &tc->opt.sacks[blk_index]; + + if (seq_leq (blk->start, hole->start)) + { + /* Block covers hole. Remove hole */ + if (seq_geq (blk->end, hole->end)) + { + next_hole = scoreboard_next_hole (sb, hole); + + /* Byte accounting */ + if (seq_lt (hole->end, ack)) + { + /* Bytes lost because snd wnd left edge advances */ + if (seq_lt (next_hole->start, ack)) + sb->sacked_bytes -= next_hole->start - hole->end; + else + sb->sacked_bytes -= ack - hole->end; + } + else + { + sb->sacked_bytes += scoreboard_hole_bytes (hole); + } + + scoreboard_remove_hole (sb, hole); + hole = next_hole; + } + /* Partial overlap */ + else + { + sb->sacked_bytes += blk->end - hole->start; + hole->start = blk->end; + blk_index++; + } + } + else + { + /* Hole must be split */ + if (seq_leq (blk->end, hole->end)) + { + sb->sacked_bytes += blk->end - blk->start; + scoreboard_insert_hole (sb, hole, blk->end, hole->end); + hole->end = blk->start - 1; + blk_index++; + } + else + { + sb->sacked_bytes += hole->end - blk->start + 1; + hole->end = blk->start - 1; + hole = scoreboard_next_hole (sb, hole); + } + } + } +} + 
+/** Update snd_wnd + * + * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ +static void +tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) +{ + if (tc->snd_wl1 < seq || (tc->snd_wl1 == seq && tc->snd_wl2 <= ack)) + { + tc->snd_wnd = snd_wnd; + tc->snd_wl1 = seq; + tc->snd_wl2 = ack; + } +} + +static void +tcp_cc_congestion (tcp_connection_t * tc) +{ + tc->cc_algo->congestion (tc); +} + +static void +tcp_cc_recover (tcp_connection_t * tc) +{ + if (tcp_in_fastrecovery (tc)) + { + tc->cc_algo->recovered (tc); + tcp_recovery_off (tc); + } + else if (tcp_in_recovery (tc)) + { + tcp_recovery_off (tc); + tc->cwnd = tcp_loss_wnd (tc); + } +} + +static void +tcp_cc_rcv_ack (tcp_connection_t * tc) +{ + u8 partial_ack; + + if (tcp_in_recovery (tc)) + { + partial_ack = seq_lt (tc->snd_una, tc->snd_una_max); + if (!partial_ack) + { + /* Clear retransmitted bytes. */ + tc->rtx_bytes = 0; + tcp_cc_recover (tc); + } + else + { + /* Clear retransmitted bytes. XXX should we clear all? 
*/ + tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* Retransmit first unacked segment */ + tcp_retransmit_first_unacked (tc); + } + } + else + { + tc->cc_algo->rcv_ack (tc); + } + + tc->rcv_dupacks = 0; + tc->tsecr_last_ack = tc->opt.tsecr; +} + +static void +tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) +{ + ASSERT (tc->snd_una == ack); + + tc->rcv_dupacks++; + if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ + if (tc->opt.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_fastrecovery_on (tc); + + /* Handle congestion and dupack */ + tcp_cc_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + + tcp_fast_retransmit (tc); + + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver */ + tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + } + else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + { + ASSERT (tcp_in_fastrecovery (tc)); + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + } +} + +void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); + tc->cc_algo->init (tc); +} + +static int +tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_header_t * th, u32 * next, u32 * error) +{ + u32 new_snd_wnd; + + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) then send an + * ACK, drop the segment, and return */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + return -1; + } + + /* If old ACK, discard */ + if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + { + *error = TCP_ERROR_ACK_OLD; + return -1; + } + + if (tcp_opts_sack_permitted (&tc->opt)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); + + new_snd_wnd = 
clib_net_to_host_u32 (th->window) << tc->snd_wscale; + + if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) + { + tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + *error = TCP_ERROR_ACK_DUP; + return -1; + } + + /* Valid ACK */ + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number; + + /* Dequeue ACKed packet and update RTT */ + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + + tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, + vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + + /* Updates congestion control (slow start/congestion avoidance) */ + tcp_cc_rcv_ack (tc); + + /* If everything has been acked, stop retransmit timer + * otherwise update */ + if (tc->snd_una == tc->snd_una_max) + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, tc->rto); + + return 0; +} + +/** + * Build SACK list as per RFC2018. + * + * Makes sure the first block contains the segment that generated the current + * ACK and the following ones are the ones most recently reported in SACK + * blocks. + * + * @param tc TCP connection for which the SACK list is updated + * @param start Start sequence number of the newest SACK block + * @param end End sequence of the newest SACK block + */ +static void +tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) +{ + sack_block_t *new_list = 0, block; + u32 n_elts; + int i; + u8 new_head = 0; + + /* If the first segment is ooo add it to the list. Last write might've moved + * rcv_nxt over the first segment. */ + if (seq_lt (tc->rcv_nxt, start)) + { + block.start = start; + block.end = end; + vec_add1 (new_list, block); + new_head = 1; + } + + /* Find the blocks still worth keeping. 
*/ + for (i = 0; i < vec_len (tc->snd_sacks); i++) + { + /* Discard if: + * 1) rcv_nxt advanced beyond current block OR + * 2) Segment overlapped by the first segment, i.e., it has been merged + * into it.*/ + if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt) + || seq_leq (tc->snd_sacks[i].start, end)) + continue; + + /* Save subsequent segments to new SACK list. */ + n_elts = clib_min (vec_len (tc->snd_sacks) - i, + TCP_MAX_SACK_BLOCKS - new_head); + vec_insert_elts (new_list, &tc->snd_sacks[i], n_elts, new_head); + break; + } + + /* Replace old vector with new one */ + vec_free (tc->snd_sacks); + tc->snd_sacks = new_list; +} + +/** Enqueue data for delivery to application */ +always_inline u32 +tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + int written; + + /* Pure ACK. Update rcv_nxt and be done. */ + if (PREDICT_FALSE (data_len == 0)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + return TCP_ERROR_PURE_ACK; + } + + written = stream_session_enqueue_data (&tc->connection, + vlib_buffer_get_current (b), + data_len, 1 /* queue event */ ); + + /* Update rcv_nxt */ + if (PREDICT_TRUE (written == data_len)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + } + /* If more data written than expected, account for out-of-order bytes. 
*/ + else if (written > data_len) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len; + + /* Send ACK confirming the update */ + tc->flags |= TCP_CONN_SNDACK; + + /* Update SACK list if need be */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* Remove SACK blocks that have been delivered */ + tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); + } + } + else + { + ASSERT (0); + return TCP_ERROR_FIFO_FULL; + } + + return TCP_ERROR_ENQUEUED; +} + +/** Enqueue out-of-order data */ +always_inline u32 +tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + stream_session_t *s0; + u32 offset, seq; + + s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); + seq = vnet_buffer (b)->tcp.seq_number; + offset = seq - tc->rcv_nxt; + + if (svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, + data_len, vlib_buffer_get_current (b))) + return TCP_ERROR_FIFO_FULL; + + /* Update SACK list if in use */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + ooo_segment_t *newest; + u32 start, end; + + /* Get the newest segment from the fifo */ + newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); + start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest); + + tcp_update_sack_list (tc, start, end); + } + + return TCP_ERROR_ENQUEUED; +} + +/** + * Check if ACK could be delayed. DELACK timer is set only after frame is + * processed so this can return true for a full bursts of packets. + */ +always_inline int +tcp_can_delack (tcp_connection_t * tc) +{ + /* If there's no DELACK timer set and the last window sent wasn't 0 we + * can safely delay. 
*/ + if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK) + && (tc->flags & TCP_CONN_SENT_RCV_WND0) == 0 + && (tc->flags & TCP_CONN_SNDACK) == 0) + return 1; + + return 0; +} + +static int +tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, + u16 n_data_bytes, u32 * next0) +{ + u32 error = 0; + + /* Handle out-of-order data */ + if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) + { + error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); + + /* Don't send more than 3 dupacks per burst + * XXX decide if this is good */ + if (tc->snt_dupacks < 3) + { + /* RFC2581: Send DUPACK for fast retransmit */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + + /* Mark as DUPACK. We may filter these in output if + * the burst fills the holes. */ + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + + tc->snt_dupacks++; + } + + goto done; + } + + /* In order data, enqueue. Fifo figures out by itself if any out-of-order + * segments can be enqueued after fifo tail offset changes. */ + error = tcp_session_enqueue_data (tc, b, n_data_bytes); + + /* Check if ACK can be delayed */ + if (tcp_can_delack (tc)) + { + /* Nothing to do for pure ACKs */ + if (n_data_bytes == 0) + goto done; + + /* If connection has not been previously marked for delay ack + * add it to the list and flag it */ + if (!tc->flags & TCP_CONN_DELACK) + { + vec_add1 (tm->delack_connections[tc->c_thread_index], + tc->c_c_index); + tc->flags |= TCP_CONN_DELACK; + } + } + else + { + /* Check if a packet has already been enqueued to output for burst. 
+ * If yes, then drop this one, otherwise, let it pass through to + * output */ + if ((tc->flags & TCP_CONN_BURSTACK) == 0) + { + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + error = TCP_ERROR_ENQUEUED; + + /* TODO: maybe add counter to ensure N acks will be sent/burst */ + tc->flags |= TCP_CONN_BURSTACK; + } + } + +done: + return error; +} + +void +delack_timers_init (tcp_main_t * tm, u32 thread_index) +{ + tcp_connection_t *tc; + u32 i, *conns; + tw_timer_wheel_16t_2w_512sl_t *tw; + + tw = &tm->timer_wheels[thread_index]; + conns = tm->delack_connections[thread_index]; + for (i = 0; i < vec_len (conns); i++) + { + tc = pool_elt_at_index (tm->connections[thread_index], conns[i]); + ASSERT (0 != tc); + + tc->timers[TCP_TIMER_DELACK] + = tw_timer_start_16t_2w_512sl (tw, conns[i], + TCP_TIMER_DELACK, TCP_DELACK_TIME); + } + vec_reset_length (tm->delack_connections[thread_index]); +} + +always_inline uword +tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum 
computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (th0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (th0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0; + + /* TODO header prediction fast path */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) + { + goto drop; + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + vlib_buffer_advance (b0, n_advance_bytes0); + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + + /* 8: check the FIN bit */ + if (tcp_fin (th0)) + { + /* Send ACK and enter CLOSE-WAIT */ + tcp_make_ack (tc0, b0); + tcp_connection_force_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; + stream_session_disconnect_notify (&tc0->connection); + } + + drop: + b0->error = node->errors[error0]; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + 
TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + delack_timers_init (tm, my_thread_index); + + return from_frame->n_vectors; +} + +static uword +tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_established_node) = +{ + .function = tcp4_established, + .name = "tcp4-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR,.error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_established_node) = +{ + .function = tcp6_established, + .name = "tcp6-established", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); + +vlib_node_registration_t tcp4_syn_sent_node; +vlib_node_registration_t tcp6_syn_sent_node; + +always_inline uword +tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, ack0, seq0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + tcp_connection_t *new_tc0; + u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = + tcp_half_open_connection_get (vnet_buffer (b0)-> + tcp.connection_index); + + ack0 = vnet_buffer (b0)->tcp.ack_number; + seq0 = vnet_buffer (b0)->tcp.seq_number; + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + 
} + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + if (PREDICT_FALSE + (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) + goto drop; + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0) + + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * 1. check the ACK bit + */ + + /* + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless + * the RST bit is set, if so drop the segment and return) + * + * and discard the segment. Return. + * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + */ + if (tcp_ack (tcp0)) + { + if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) + { + if (!tcp_rst (tcp0)) + tcp_send_reset (b0, is_ip4); + + goto drop; + } + + /* Make sure ACK is valid */ + if (tc0->snd_una > ack0) + goto drop; + } + + /* + * 2. check the RST bit + */ + + if (tcp_rst (tcp0)) + { + /* If ACK is acceptable, signal client that peer is not + * willing to accept connection and drop connection*/ + if (tcp_ack (tcp0)) + { + stream_session_connect_notify (&tc0->connection, sst, + 1 /* fail */ ); + tcp_connection_cleanup (tc0); + } + goto drop; + } + + /* + * 3. check the security and precedence (skipped) + */ + + /* + * 4. check the SYN bit + */ + + /* No SYN flag. Drop. */ + if (!tcp_syn (tcp0)) + goto drop; + + /* Stop connection establishment and retransmit timers */ + tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + + /* Valid SYN or SYN-ACK. Move connection from half-open pool to + * current thread pool. 
*/ + pool_get (tm->connections[my_thread_index], new_tc0); + clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + + new_tc0->c_thread_index = my_thread_index; + + /* Cleanup half-open connection XXX lock */ + pool_put (tm->half_open_connections, tc0); + + new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; + new_tc0->irs = seq0; + + /* Parse options */ + tcp_options_parse (tcp0, &new_tc0->opt); + tcp_connection_init_vars (new_tc0); + + if (tcp_opts_tstamp (&new_tc0->opt)) + { + new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&new_tc0->opt)) + new_tc0->snd_wscale = new_tc0->opt.wscale; + + new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << new_tc0->snd_wscale; + new_tc0->snd_wl1 = seq0; + new_tc0->snd_wl2 = ack0; + + /* SYN-ACK: See if we can switch to ESTABLISHED state */ + if (tcp_ack (tcp0)) + { + /* Our SYN is ACKed: we have iss < ack = snd_una */ + + /* TODO Dequeue acknowledged segments if we support Fast Open */ + new_tc0->snd_una = ack0; + new_tc0->state = TCP_STATE_ESTABLISHED; + + /* Notify app that we have connection */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + /* Make sure after data segment processing ACK is sent */ + new_tc0->flags |= TCP_CONN_SNDACK; + } + /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ + else + { + new_tc0->state = TCP_STATE_SYN_RCVD; + + /* Notify app that we have connection XXX */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + tcp_make_synack (new_tc0, b0); + next0 = tcp_next_output (is_ip4); + + goto drop; + } + + /* Read data, if any */ + if (n_data_bytes0) + { + error0 = + tcp_segment_rcv (tm, new_tc0, b0, n_data_bytes0, &next0); + if (error0 == TCP_ERROR_PURE_ACK) + error0 = TCP_ERROR_SYN_ACKS_RCVD; + } + else + { + tcp_make_ack (new_tc0, b0); + next0 = tcp_next_output (new_tc0->c_is_ip4); + } + + drop: + + b0->error = error0 ? 
node->errors[error0] : 0; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_syn_sent_node) = +{ + .function = tcp4_syn_sent, + .name = "tcp4-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_syn_sent_node) = +{ + .function = tcp6_syn_sent_rcv, + .name = "tcp6-syn-sent", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + } +,}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); +/** + * Handles reception for all states except LISTEN, SYN-SEND and ESTABLISHED + * as per RFC793 p. 64 + */ +always_inline uword +tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 
(ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * Special treatment for CLOSED + */ + switch (tc0->state) + { + case TCP_STATE_CLOSED: + goto drop; + break; + } + + /* + * For all other states (except LISTEN) + */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE + (tcp_segment_validate (vm, tc0, b0, tcp0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + switch (tc0->state) + { + case TCP_STATE_SYN_RCVD: + /* + * If the segment acknowledgment is not acceptable, form a + * reset segment, + * + * and send it. + */ + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + /* Switch state to ESTABLISHED */ + tc0->state = TCP_STATE_ESTABLISHED; + + /* Initialize session variables */ + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << tc0->opt.wscale; + tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + /* Shoulder tap the server */ + stream_session_accept_notify (&tc0->connection); + + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + break; + case TCP_STATE_ESTABLISHED: + /* We can get packets in established state here because they + * were enqueued before state change */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + break; + case TCP_STATE_FIN_WAIT_1: + /* In addition to the processing for the ESTABLISHED state, if + * our FIN is now acknowledged then enter FIN-WAIT-2 and + * continue processing in that state. 
*/ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + tc0->state = TCP_STATE_FIN_WAIT_2; + /* Stop all timers, 2MSL will be set lower */ + tcp_connection_timers_reset (tc0); + break; + case TCP_STATE_FIN_WAIT_2: + /* In addition to the processing for the ESTABLISHED state, if + * the retransmission queue is empty, the user's CLOSE can be + * acknowledged ("ok") but do not delete the TCB. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + /* check if rtx queue is empty and ack CLOSE TODO */ + break; + case TCP_STATE_CLOSE_WAIT: + /* Do the same processing as for the ESTABLISHED state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + break; + case TCP_STATE_CLOSING: + /* In addition to the processing for the ESTABLISHED state, if + * the ACK acknowledges our FIN then enter the TIME-WAIT state, + * otherwise ignore the segment. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + /* XXX test that send queue empty */ + tc0->state = TCP_STATE_TIME_WAIT; + goto drop; + + break; + case TCP_STATE_LAST_ACK: + /* The only thing that can arrive in this state is an + * acknowledgment of our FIN. If our FIN is now acknowledged, + * delete the TCB, enter the CLOSED state, and return. */ + + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + goto drop; + + tcp_connection_del (tc0); + goto drop; + + break; + case TCP_STATE_TIME_WAIT: + /* The only thing that can arrive in this state is a + * retransmission of the remote FIN. Acknowledge it, and restart + * the 2 MSL timeout. 
*/ + + /* TODO */ + goto drop; + break; + default: + ASSERT (0); + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_FIN_WAIT_2: + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + case TCP_STATE_TIME_WAIT: + /* This should not occur, since a FIN has been received from the + * remote side. Ignore the segment text. */ + break; + } + + /* 8: check the FIN bit */ + if (!tcp_fin (tcp0)) + goto drop; + + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_SYN_RCVD: + /* Send FIN-ACK notify app and enter CLOSE-WAIT */ + tcp_connection_timers_reset (tc0); + tcp_make_finack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + stream_session_disconnect_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSE_WAIT; + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + /* move along .. */ + break; + case TCP_STATE_FIN_WAIT_1: + tc0->state = TCP_STATE_TIME_WAIT; + tcp_connection_timers_reset (tc0); + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + case TCP_STATE_FIN_WAIT_2: + /* Got FIN, send ACK! */ + tc0->state = TCP_STATE_TIME_WAIT; + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); + break; + case TCP_STATE_TIME_WAIT: + /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + * timeout. + */ + tcp_timer_update (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + } + + b0->error = error0 ? 
node->errors[error0] : 0; + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_rcv_process_node) = +{ + .function = tcp4_rcv_process, + .name = "tcp4-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_rcv_process_node) = +{ + .function = tcp6_rcv_process, + .name = "tcp6-rcv-process", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process); + +vlib_node_registration_t tcp4_listen_node; +vlib_node_registration_t tcp6_listen_node; + +/** + * LISTEN state processing as per RFC 793 p. 65 + */ +always_inline uword +tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *lc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + tcp_connection_t *child0; + u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + } + + /* Create child session. For syn-flood protection use filter */ + + /* 1. first check for an RST */ + if (tcp_rst (th0)) + goto drop; + + /* 2. 
second check for an ACK */ + if (tcp_ack (th0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + + /* 3. check for a SYN (did that already) */ + + /* Create child session and send SYN-ACK */ + pool_get (tm->connections[my_thread_index], child0); + memset (child0, 0, sizeof (*child0)); + + child0->c_c_index = child0 - tm->connections[my_thread_index]; + child0->c_lcl_port = lc0->c_lcl_port; + child0->c_rmt_port = th0->src_port; + child0->c_is_ip4 = is_ip4; + child0->c_thread_index = my_thread_index; + + if (is_ip4) + { + child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32; + child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32; + } + else + { + clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); + } + + if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, + 0 /* notify */ )) + { + error0 = TCP_ERROR_CREATE_SESSION_FAIL; + goto drop; + } + + tcp_options_parse (th0, &child0->opt); + tcp_connection_init_vars (child0); + + child0->irs = vnet_buffer (b0)->tcp.seq_number; + child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->state = TCP_STATE_SYN_RCVD; + + /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} + * segments are used to initialize PAWS. */ + if (tcp_opts_tstamp (&child0->opt)) + { + child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent_age = tcp_time_now (); + } + + /* Reuse buffer to make syn-ack and send */ + tcp_make_synack (child0, b0); + next0 = tcp_next_output (is_ip4); + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + b0->error = error0 ? 
node->errors[error0] : 0; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_listen_node) = +{ + .function = tcp4_listen, + .name = "tcp4-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_listen_node) = +{ + .function = tcp6_listen, + .name = "tcp6-listen", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen); + +vlib_node_registration_t tcp4_input_node; +vlib_node_registration_t tcp6_input_node; + +typedef enum _tcp_input_next +{ + TCP_INPUT_NEXT_DROP, + TCP_INPUT_NEXT_LISTEN, + TCP_INPUT_NEXT_RCV_PROCESS, + TCP_INPUT_NEXT_SYN_SENT, + TCP_INPUT_NEXT_ESTABLISHED, + TCP_INPUT_NEXT_RESET, + TCP_INPUT_N_NEXT +} tcp_input_next_t; + +#define foreach_tcp4_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp4-listen") \ + _ (RCV_PROCESS, "tcp4-rcv-process") \ + _ (SYN_SENT, "tcp4-syn-sent") \ + _ (ESTABLISHED, "tcp4-established") \ + _ (RESET, "tcp4-reset") + +#define foreach_tcp6_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp6-listen") \ + _ (RCV_PROCESS, "tcp6-rcv-process") \ + _ (SYN_SENT, "tcp6-syn-sent") \ + _ (ESTABLISHED, "tcp6-established") \ + _ (RESET, "tcp6-reset") + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 state; +} tcp_rx_trace_t; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + +u8 * +format_tcp_state (u8 * s, va_list * args) +{ + tcp_state_t *state = va_arg (*args, tcp_state_t *); + + if (state[0] < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state[0]]); + else + s = format (s, "UNKNOWN"); + + return s; +} + +u8 * +format_tcp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + + s = format (s, "TCP: src-port %d dst-port %U%s\n", + clib_net_to_host_u16 (t->src_port), + clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state); + + return s; +} + +#define filter_flags 
(TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) + +always_inline uword +tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + session_manager_main_t *ssm = vnet_get_session_manager_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; + u8 flags0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + + /* lookup session */ + tc0 = + (tcp_connection_t *) stream_session_lookup_transport4 (ssm, + &ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + tc0 = + (tcp_connection_t *) stream_session_lookup_transport6 (ssm, + &ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + } + + /* Session exists */ + if (PREDICT_TRUE (0 != tc0)) + { + /* Save connection index */ + vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index; + vnet_buffer (b0)->tcp.seq_number = + clib_net_to_host_u32 (tcp0->seq_number); + vnet_buffer (b0)->tcp.ack_number = + clib_net_to_host_u32 (tcp0->ack_number); + + flags0 = tcp0->flags & filter_flags; + next0 
= tm->dispatch_table[tc0->state][flags0].next; + error0 = tm->dispatch_table[tc0->state][flags0].error; + + if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) + { + /* Overload tcp flags to store state */ + vnet_buffer (b0)->tcp.flags = tc0->state; + } + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + vnet_buffer (b0)->tcp.flags = 0; + } + + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_input_node) = +{ + .function = tcp4_input, + .name = "tcp4-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp4_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_input_node) = +{ + .function = tcp6_input, + .name = "tcp6-input", + /* Takes a vector of packets. 
*/ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp6_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input); +void +tcp_update_time (f64 now, u32 thread_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now); +} + +static void +tcp_dispatch_table_init (tcp_main_t * tm) +{ + int i, j; + for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++) + for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++) + { + tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP; + tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH; + } + +#define _(t,f,n,e) \ +do { \ + tm->dispatch_table[TCP_STATE_##t][f].next = (n); \ + tm->dispatch_table[TCP_STATE_##t][f].error = (e); \ +} while (0) + + /* SYNs for new connections -> tcp-listen. */ + _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); + /* ACK for for a SYN-ACK -> tcp-rcv-process. */ + _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* SYN-ACK for a SYN */ + _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + /* ACK for for established connection -> tcp-established. */ + _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + /* FIN for for established connection -> tcp-established. 
*/ + _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + /* ACK or FIN-ACK to our FIN */ + _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + /* FIN in reply to our FIN from the other side */ + _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* FIN confirming that the peer (app) has closed */ + _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); +#undef _ +} + +clib_error_t * +tcp_input_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + if ((error = vlib_call_init_function (vm, tcp_init))) + return error; + + /* Initialize dispatch table. 
*/ + tcp_dispatch_table_init (tm); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From e04c29942af6a130591059679531c9ffa3d7237a Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 1 Mar 2017 08:17:34 -0800 Subject: Cleanup URI code and TCP bugfixing - Add CLI/API to enable session layer, by default it's disabled - Improve rcv wnd computation - Improvements to tx path - URI code cleanup - Builtin test tcp server - Improve src port allocation Change-Id: I2ace498e76a0771d4c31a8075cc14fe33d7dfa38 Signed-off-by: Florin Coras --- src/scripts/vnet/uri/dummy_app.py | 65 +++ src/scripts/vnet/uri/tcp_server | 1 + src/svm/svm_fifo.c | 6 +- src/uri.am | 10 +- src/uri/uri_tcp_test.c | 792 +++++++++++++------------ src/uri/uri_udp_test.c | 442 +++++++++++++- src/uri/uri_udp_test2.c | 954 ------------------------------- src/uri/uritest.c | 484 ---------------- src/vnet.am | 1 + src/vnet/api_errno.h | 3 +- src/vnet/session/application.c | 27 +- src/vnet/session/application.h | 1 + src/vnet/session/application_interface.c | 6 +- src/vnet/session/node.c | 57 +- src/vnet/session/session.api | 22 + src/vnet/session/session.c | 86 ++- src/vnet/session/session.h | 23 +- src/vnet/session/session_api.c | 59 +- src/vnet/session/session_cli.c | 63 +- src/vnet/tcp/builtin_server.c | 135 +++++ src/vnet/tcp/tcp.c | 48 +- src/vnet/tcp/tcp.h | 4 +- src/vnet/tcp/tcp_input.c | 56 +- src/vnet/tcp/tcp_output.c | 90 ++- 24 files changed, 1460 insertions(+), 1975 deletions(-) create mode 100644 src/scripts/vnet/uri/dummy_app.py delete mode 100644 src/uri/uri_udp_test2.c delete mode 100644 src/uri/uritest.c create mode 100644 src/vnet/tcp/builtin_server.c (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/scripts/vnet/uri/dummy_app.py b/src/scripts/vnet/uri/dummy_app.py new file mode 100644 index 00000000..b80fbb28 --- /dev/null +++ 
b/src/scripts/vnet/uri/dummy_app.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import socket +import sys +import bitstring + +# action can be reflect or drop +action = "drop" + +def handle_connection (connection, client_address): + print("Received connection from {}".format(repr(client_address))) + try: + while True: + data = connection.recv(4096) + if not data: + break; + if (action != "drop"): + connection.sendall(data) + finally: + connection.close() + +def run_server(ip, port): + print("Starting server {}:{}".format(repr(ip), repr(port))) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_address = (ip, int(port)) + sock.bind(server_address) + sock.listen(1) + + while True: + connection, client_address = sock.accept() + handle_connection (connection, client_address) + +def prepare_data(): + buf = [] + for i in range (0, pow(2, 16)): + buf.append(i & 0xff) + return bytearray(buf) + +def run_client(ip, port): + print("Starting client {}:{}".format(repr(ip), repr(port))) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_address = ("6.0.1.1", 1234) + sock.connect(server_address) + + data = prepare_data() + try: + sock.sendall(data) + finally: + sock.close() + +def run(mode, ip, port): + if (mode == "server"): + run_server (ip, port) + elif (mode == "client"): + run_client (ip, port) + else: + raise Exception("Unknown mode. 
Only client and server supported") + +if __name__ == "__main__": + if (len(sys.argv)) < 4: + raise Exception("Usage: ./dummy_app []") + if (len(sys.argv) == 5): + action = sys.argv[4] + + run (sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/src/scripts/vnet/uri/tcp_server b/src/scripts/vnet/uri/tcp_server index 7f5a86de..c29afc6f 100644 --- a/src/scripts/vnet/uri/tcp_server +++ b/src/scripts/vnet/uri/tcp_server @@ -2,3 +2,4 @@ create host-interface name vpp1 set int state host-vpp1 up set int ip address host-vpp1 6.0.1.1/24 trace add af-packet-input 10 +session enable diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 11f90193..e3f534b1 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -508,9 +508,9 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, { /* Number of bytes in first copy segment */ first_copy_bytes = - ((nitems - f->head) < total_copy_bytes) ? - (nitems - f->head) : total_copy_bytes; - clib_memcpy (copy_here, &f->data[f->head], first_copy_bytes); + ((nitems - f->head + offset) < total_copy_bytes) ? + (nitems - f->head + offset) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[f->head + offset], first_copy_bytes); /* Number of bytes in second copy segment, if any */ second_copy_bytes = total_copy_bytes - first_copy_bytes; diff --git a/src/uri.am b/src/uri.am index 8cdd77c6..09b5b15b 100644 --- a/src/uri.am +++ b/src/uri.am @@ -11,12 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-noinst_PROGRAMS += uri_udp_test2 uri_tcp_test +noinst_PROGRAMS += uri_udp_test uri_tcp_test -uri_udp_test2_SOURCES = uri/uri_udp_test2.c -uri_udp_test2_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ - libvppinfra.la -lpthread -lm -lrt +uri_udp_test_SOURCES = uri/uri_udp_test.c +uri_udp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ + libvppinfra.la -lpthread -lm -lrt uri_tcp_test_SOURCES = uri/uri_tcp_test.c uri_tcp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ - libvppinfra.la -lpthread -lm -lrt + libvppinfra.la -lpthread -lm -lrt diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index ed5a37d8..6c9cf1db 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -20,16 +20,15 @@ #include #include #include +#include -#include "../vnet/session/application_interface.h" - -#define vl_typedefs /* define message structures */ +#define vl_typedefs /* define message structures */ #include #undef vl_typedefs /* declare message handlers for each api */ -#define vl_endianfun /* define message structures */ +#define vl_endianfun /* define message structures */ #include #undef vl_endianfun @@ -45,8 +44,8 @@ vlib_main_t **vlib_mains; typedef struct { - svm_fifo_t * server_rx_fifo; - svm_fifo_t * server_tx_fifo; + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; u32 vpp_session_index; u32 vpp_session_thread; @@ -69,19 +68,19 @@ typedef struct u32 my_client_index; /* The URI we're playing with */ - u8 * uri; + u8 *uri; /* Session pool */ - session_t * sessions; + session_t *sessions; /* Hash table for disconnect processing */ - uword * session_index_by_vpp_handles; + uword *session_index_by_vpp_handles; /* intermediate rx buffer */ - u8 * rx_buf; + u8 *rx_buf; /* URI for slave's connect */ - u8 * connect_uri; + u8 *connect_uri; u32 connected_session_index; @@ -91,10 +90,10 @@ typedef struct int drop_packets; /* Our event queue */ - unix_shared_memory_queue_t * our_event_queue; + unix_shared_memory_queue_t 
*our_event_queue; /* $$$ single thread only for the moment */ - unix_shared_memory_queue_t * vpp_event_queue; + unix_shared_memory_queue_t *vpp_event_queue; pid_t my_pid; @@ -111,12 +110,15 @@ typedef struct u32 configured_segment_size; /* VNET_API_ERROR_FOO -> "Foo" hash table */ - uword * error_string_by_error_number; - - /* convenience */ - svm_fifo_segment_main_t * segment_main; + uword *error_string_by_error_number; u8 *connect_test_data; + pthread_t client_rx_thread_handle; + u32 client_bytes_received; + u8 test_return_packets; + + /* convenience */ + svm_fifo_segment_main_t *segment_main; } uri_tcp_test_main_t; uri_tcp_test_main_t uri_tcp_test_main; @@ -141,7 +143,7 @@ wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) while (clib_time_now (&utm->clib_time) < timeout) { if (utm->state == state) - return 0; + return 0; if (utm->state == STATE_FAILED) return -1; } @@ -209,7 +211,7 @@ connect_to_vpp (char *name) } static void -vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t *mp) +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) { svm_fifo_segment_create_args_t _a, *a = &_a; int rv; @@ -221,24 +223,24 @@ vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t *mp) if (rv) { clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); + mp->segment_name); return; } clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, - mp->segment_size); + mp->segment_size); } static void vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - session_t * session; - vl_api_disconnect_session_reply_t * rmp; - uword * p; + session_t *session; + vl_api_disconnect_session_reply_t *rmp; + uword *p; int rv = 0; u64 key; - key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; p = hash_get (utm->session_index_by_vpp_handles, 
key); @@ -254,6 +256,8 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rv = -11; } + utm->time_to_stop = 1; + rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); @@ -261,32 +265,32 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rmp->retval = rv; rmp->session_index = mp->session_index; rmp->session_thread_index = mp->session_thread_index; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } static void vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - session_t * session; - vl_api_reset_session_reply_t * rmp; - uword * p; + session_t *session; + vl_api_reset_session_reply_t *rmp; + uword *p; int rv = 0; u64 key; - key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - p = hash_get(utm->session_index_by_vpp_handles, key); + p = hash_get (utm->session_index_by_vpp_handles, key); if (p) { - session = pool_elt_at_index(utm->sessions, p[0]); - hash_unset(utm->session_index_by_vpp_handles, key); - pool_put(utm->sessions, session); + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); } else { - clib_warning("couldn't find session key %llx", key); + clib_warning ("couldn't find session key %llx", key); rv = -11; } @@ -296,301 +300,95 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) rmp->retval = rv; rmp->session_index = mp->session_index; rmp->session_thread_index = mp->session_thread_index; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } void -handle_fifo_event_connect_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) +client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, + session_fifo_event_t 
* e) { - svm_fifo_t * rx_fifo; - int n_read, bytes; + svm_fifo_t *rx_fifo; + int n_read, bytes, i; rx_fifo = e->fifo; bytes = e->enqueue_length; do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), - utm->rx_buf); + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), + utm->rx_buf); if (n_read > 0) - bytes -= n_read; + { + bytes -= n_read; + for (i = 0; i < n_read; i++) + { + if (utm->rx_buf[i] != ((utm->client_bytes_received + i) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + utm->client_bytes_received + i, + utm->rx_buf[i], + ((utm->client_bytes_received + i) & 0xff)); + } + } + utm->client_bytes_received += n_read; + } + } while (n_read < 0 || bytes > 0); - - // bytes_to_read = svm_fifo_max_dequeue (rx_fifo); - // - // bytes_to_read = vec_len(utm->rx_buf) > bytes_to_read ? - // bytes_to_read : vec_len(utm->rx_buf); - // - // buffer_offset = 0; - // while (bytes_to_read > 0) - // { - // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, - // bytes_to_read, - // utm->rx_buf + buffer_offset); - // if (rv > 0) - // { - // bytes_to_read -= rv; - // buffer_offset += rv; - // bytes_received += rv; - // } - // } - - - // while (bytes_received < bytes_sent) - // { - // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, - // vec_len (utm->rx_buf), - // utm->rx_buf); - // if (rv > 0) - // { - //#if CLIB_DEBUG > 0 - // int j; - // for (j = 0; j < rv; j++) - // { - // if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) - // { - // clib_warning ("error at byte %lld, 0x%x not 0x%x", - // bytes_received + j, - // utm->rx_buf[j], - // ((bytes_received + j )&0xff)); - // } - // } - //#endif - // bytes_received += (u64) rv; - // } - // } } void -handle_connect_event_queue (uri_tcp_test_main_t * utm) +client_handle_event_queue (uri_tcp_test_main_t * utm) { session_fifo_event_t _e, *e = &_e;; - unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, 0 /* nowait */); + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 
*) e, + 0 /* nowait */ ); switch (e->event_type) { case FIFO_EVENT_SERVER_RX: - handle_fifo_event_connect_rx (utm, e); + client_handle_fifo_event_rx (utm, e); break; case FIFO_EVENT_SERVER_EXIT: return; default: - clib_warning("unknown event type %d", e->event_type); + clib_warning ("unknown event type %d", e->event_type); break; } } -void -uri_tcp_connect_send (uri_tcp_test_main_t *utm) -{ - u8 *test_data = utm->connect_test_data; - u64 bytes_sent = 0; - int rv; - int mypid = getpid(); - session_t * session; - svm_fifo_t *tx_fifo; - int buffer_offset, bytes_to_send = 0; - session_fifo_event_t evt; - static int serial_number = 0; - int i; - u32 max_chunk = 64 << 10, write; - - session = pool_elt_at_index (utm->sessions, utm->connected_session_index); - tx_fifo = session->server_tx_fifo; - - vec_validate (utm->rx_buf, vec_len (test_data) - 1); - - for (i = 0; i < 10; i++) - { - bytes_to_send = vec_len (test_data); - buffer_offset = 0; - while (bytes_to_send > 0) - { - write = bytes_to_send > max_chunk ? 
max_chunk : bytes_to_send; - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, - test_data + buffer_offset); - - if (rv > 0) - { - bytes_to_send -= rv; - buffer_offset += rv; - bytes_sent += rv; - - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = rv; - evt.event_id = serial_number++; - - unix_shared_memory_queue_add (utm->vpp_event_queue, (u8 *) &evt, - 0 /* do wait for mutex */); - } - } - } -} - -static void -uri_tcp_client_test (uri_tcp_test_main_t * utm) -{ - vl_api_connect_uri_t * cmp; - vl_api_disconnect_session_t *dmp; - session_t *connected_session; - int i; - - cmp = vl_msg_api_alloc (sizeof (*cmp)); - memset (cmp, 0, sizeof (*cmp)); - - cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = utm->my_client_index; - cmp->context = ntohl(0xfeedface); - memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&cmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - return; - } - - /* Init test data */ - vec_validate (utm->connect_test_data, 64 * 1024 - 1); - for (i = 0; i < vec_len (utm->connect_test_data); i++) - utm->connect_test_data[i] = i & 0xff; - - /* Start reader thread */ - /* handle_connect_event_queue (utm); */ - - /* Start send */ - uri_tcp_connect_send (utm); - - /* Disconnect */ - connected_session = pool_elt_at_index(utm->sessions, - utm->connected_session_index); - dmp = vl_msg_api_alloc (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = utm->my_client_index; - dmp->session_index = connected_session->vpp_session_index; - dmp->session_thread_index = connected_session->vpp_session_thread; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&dmp); -} - -void -handle_fifo_event_server_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) -{ - svm_fifo_t * rx_fifo, * tx_fifo; - int 
n_read; - - session_fifo_event_t evt; - unix_shared_memory_queue_t *q; - int rv, bytes; - - rx_fifo = e->fifo; - tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; - - bytes = e->enqueue_length; - do - { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), - utm->rx_buf); - - /* Reflect if a non-drop session */ - if (!utm->drop_packets && n_read > 0) - { - do - { - rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); - } - while (rv == -2); - - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = n_read; - evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) &evt, 0 /* do wait for mutex */); - } - - if (n_read > 0) - bytes -= n_read; - } - while (n_read < 0 || bytes > 0); -} - -void -handle_event_queue (uri_tcp_test_main_t * utm) +static void * +client_rx_thread_fn (void *arg) { - session_fifo_event_t _e, *e = &_e;; + session_fifo_event_t _e, *e = &_e; + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + utm->client_bytes_received = 0; while (1) { - unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *)e, - 0 /* nowait */); + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); switch (e->event_type) - { - case FIFO_EVENT_SERVER_RX: - handle_fifo_event_server_rx (utm, e); - break; - - case FIFO_EVENT_SERVER_EXIT: - return; - - default: - clib_warning ("unknown event type %d", e->event_type); - break; - } - if (PREDICT_FALSE(utm->time_to_stop == 1)) - break; - if (PREDICT_FALSE(utm->time_to_print_stats == 1)) - { - utm->time_to_print_stats = 0; - fformat(stdout, "%d connections\n", pool_elts (utm->sessions)); - } + { + case FIFO_EVENT_SERVER_RX: + client_handle_fifo_event_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return 0; + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + + if (PREDICT_FALSE 
(utm->time_to_stop == 1)) + break; } + pthread_exit (0); } -static void -vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) -{ - uri_tcp_test_main_t *utm = &uri_tcp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; - - if (mp->retval) - { - clib_warning("bind failed: %d", mp->retval); - return; - } - - if (mp->segment_name_length == 0) - { - clib_warning("segment_name_length zero"); - return; - } - - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - - ASSERT(mp->server_event_queue_address); - - /* Attach to the segment vpp created */ - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning("svm_fifo_segment_attach ('%s') failed", mp->segment_name); - return; - } - - utm->our_event_queue = - (unix_shared_memory_queue_t *) mp->server_event_queue_address; - - utm->state = STATE_READY; -} static void vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) @@ -601,6 +399,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) u32 session_index; svm_fifo_t *rx_fifo, *tx_fifo; int rv; + u64 key; if (mp->retval) { @@ -608,6 +407,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) utm->state = STATE_FAILED; return; } + /* * Attatch to segment */ @@ -622,14 +422,14 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) a->segment_name = (char *) mp->segment_name; a->segment_size = mp->segment_size; - ASSERT(mp->client_event_queue_address); + ASSERT (mp->client_event_queue_address); /* Attach to the segment vpp created */ rv = svm_fifo_segment_attach (a); if (rv) { clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); + mp->segment_name); return; } @@ -650,9 +450,9 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) pool_get (utm->sessions, session); session_index = session - utm->sessions; - rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; 
rx_fifo->client_session_index = session_index; - tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; tx_fifo->client_session_index = session_index; session->server_rx_fifo = rx_fifo; @@ -662,54 +462,193 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) /* Save handle */ utm->connected_session_index = session_index; - utm->state = STATE_READY; + + /* Add it to lookup table */ + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + hash_set (utm->session_index_by_vpp_handles, key, session_index); + + /* Start RX thread */ + rv = pthread_create (&utm->client_rx_thread_handle, + NULL /*attr */ , client_rx_thread_fn, 0); + if (rv) + { + clib_warning ("pthread_create returned %d", rv); + rv = VNET_API_ERROR_SYSCALL_ERROR_1; + } } void -uri_tcp_bind (uri_tcp_test_main_t *utm) +client_send_data (uri_tcp_test_main_t * utm) { - vl_api_bind_uri_t * bmp; - u32 fifo_size = 3 << 20; - bmp = vl_msg_api_alloc (sizeof (*bmp)); - memset (bmp, 0, sizeof (*bmp)); + u8 *test_data = utm->connect_test_data; + u64 bytes_sent = 0; + int rv; + int mypid = getpid (); + session_t *session; + svm_fifo_t *tx_fifo; + int buffer_offset, bytes_to_send = 0; + session_fifo_event_t evt; + static int serial_number = 0; + int i; + u32 max_chunk = 64 << 10, write; - bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); - bmp->client_index = utm->my_client_index; - bmp->context = ntohl(0xfeedface); - bmp->initial_segment_size = 256<<20; /* size of initial segment */ - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; - bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; - bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; - bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128<<20; - memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&bmp); + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + 
tx_fifo = session->server_tx_fifo; + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < 1; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + write = bytes_to_send > max_chunk ? max_chunk : bytes_to_send; + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = rv; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (utm->vpp_event_queue, + (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + } + } + + if (utm->test_return_packets) + { + f64 timeout = clib_time_now (&utm->clib_time) + 2; + + /* Wait for the outstanding packets */ + while (utm->client_bytes_received < vec_len (test_data)) + { + if (clib_time_now (&utm->clib_time) > timeout) + { + clib_warning ("timed out waiting for the missing packets"); + break; + } + } + + utm->time_to_stop = 1; + } +} + +void +client_connect (uri_tcp_test_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); +} + +void +client_disconnect (uri_tcp_test_main_t * utm) +{ + session_t *connected_session; + vl_api_disconnect_session_t *dmp; + connected_session = pool_elt_at_index (utm->sessions, + utm->connected_session_index); + dmp = vl_msg_api_alloc (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = utm->my_client_index; + dmp->session_index = connected_session->vpp_session_index; 
+ dmp->session_thread_index = connected_session->vpp_session_thread; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & dmp); +} + +static void +client_test (uri_tcp_test_main_t * utm) +{ + int i; + + client_connect (utm); + + if (wait_for_state_change (utm, STATE_READY)) + { + return; + } + + /* Init test data */ + vec_validate (utm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (utm->connect_test_data); i++) + utm->connect_test_data[i] = i & 0xff; + + /* Start send */ + client_send_data (utm); + + /* Disconnect */ + client_disconnect (utm); } static void -vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t *mp) +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; - if (mp->retval != 0) - clib_warning ("returned %d", ntohl(mp->retval)); + if (mp->retval) + { + clib_warning ("bind failed: %d", mp->retval); + utm->state = STATE_FAILED; + return; + } - utm->state = STATE_START; + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT (mp->server_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + utm->our_event_queue = + (unix_shared_memory_queue_t *) mp->server_event_queue_address; + + utm->state = STATE_READY; } -void -uri_tcp_unbind (uri_tcp_test_main_t *utm) +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) { - vl_api_unbind_uri_t * ump; + uri_tcp_test_main_t *utm = &uri_tcp_test_main; - ump = vl_msg_api_alloc (sizeof (*ump)); - memset (ump, 0, sizeof (*ump)); + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); - ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); - 
ump->client_index = utm->my_client_index; - memcpy (ump->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&ump); + utm->state = STATE_START; } static void @@ -717,14 +656,14 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; vl_api_accept_session_reply_t *rmp; - svm_fifo_t * rx_fifo, * tx_fifo; - session_t * session; + svm_fifo_t *rx_fifo, *tx_fifo; + session_t *session; static f64 start_time; u64 key; u32 session_index; if (start_time == 0.0) - start_time = clib_time_now (&utm->clib_time); + start_time = clib_time_now (&utm->clib_time); utm->vpp_event_queue = (unix_shared_memory_queue_t *) mp->vpp_event_queue_address; @@ -733,45 +672,159 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) pool_get (utm->sessions, session); session_index = session - utm->sessions; - rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; rx_fifo->client_session_index = session_index; - tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; tx_fifo->client_session_index = session_index; session->server_rx_fifo = rx_fifo; session->server_tx_fifo = tx_fifo; /* Add it to lookup table */ - key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; hash_set (utm->session_index_by_vpp_handles, key, session_index); utm->state = STATE_READY; /* Stats printing */ - if (pool_elts (utm->sessions) && (pool_elts(utm->sessions) % 20000) == 0) + if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) { f64 now = clib_time_now (&utm->clib_time); fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", - pool_elts(utm->sessions), now - start_time, - (f64)pool_elts(utm->sessions) / (now - start_time)); + pool_elts (utm->sessions), now - start_time, + (f64) pool_elts (utm->sessions) / (now - 
start_time)); } - /* Send accept reply to vpp */ + /* + * Send accept reply to vpp + */ rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); rmp->session_type = mp->session_type; rmp->session_index = mp->session_index; rmp->session_thread_index = mp->session_thread_index; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } void -uri_tcp_server_test (uri_tcp_test_main_t * utm) +server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, + session_fifo_event_t * e) { + svm_fifo_t *rx_fifo, *tx_fifo; + int n_read; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv, bytes; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + bytes = e->enqueue_length; + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), + utm->rx_buf); + + /* Reflect if a non-drop session */ + if (!utm->drop_packets && n_read > 0) + { + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = n_read; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + + if (n_read > 0) + bytes -= n_read; + } + while (n_read < 0 || bytes > 0); +} + +void +server_handle_event_queue (uri_tcp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + server_handle_fifo_event_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE 
(utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE (utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +void +server_bind (uri_tcp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + u32 fifo_size = 3 << 20; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->initial_segment_size = 256 << 20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +void +server_unbind (uri_tcp_test_main_t * utm) +{ + vl_api_unbind_uri_t *ump; + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); +} + +void +server_test (uri_tcp_test_main_t * utm) +{ /* Bind to uri */ - uri_tcp_bind (utm); + server_bind (utm); if (wait_for_state_change (utm, STATE_READY)) { @@ -780,10 +833,10 @@ uri_tcp_server_test (uri_tcp_test_main_t * utm) } /* Enter handle event loop */ - handle_event_queue (utm); + server_handle_event_queue (utm); /* Cleanup */ - uri_tcp_unbind (utm); + server_unbind (utm); if (wait_for_state_change (utm, STATE_START)) { @@ -824,12 +877,12 @@ main (int argc, char **argv) unformat_input_t _argv, *a = &_argv; u8 *chroot_prefix; u8 *heap; - u8 * bind_name = (u8 *) "tcp://0.0.0.0/1234"; + u8 *bind_name = (u8 *) "tcp://0.0.0.0/1234"; 
u32 tmp; mheap_t *h; - session_t * session; + session_t *session; int i; - int i_am_master = 1, drop_packets = 0; + int i_am_master = 1, drop_packets = 0, test_return_packets = 0; clib_mem_init (0, 256 << 20); @@ -841,53 +894,54 @@ main (int argc, char **argv) vec_validate (utm->rx_buf, 65536); - utm->session_index_by_vpp_handles = - hash_create (0, sizeof(uword)); + utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); - utm->my_pid = getpid(); - utm->configured_segment_size = 1<<20; + utm->my_pid = getpid (); + utm->configured_segment_size = 1 << 20; clib_time_init (&utm->clib_time); init_error_string_table (utm); - svm_fifo_segment_init(0x200000000ULL, 20); + svm_fifo_segment_init (0x200000000ULL, 20); unformat_init_command_line (a, argv); while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) { if (unformat (a, "chroot prefix %s", &chroot_prefix)) - { - vl_set_memory_root_path ((char *) chroot_prefix); - } + { + vl_set_memory_root_path ((char *) chroot_prefix); + } else if (unformat (a, "uri %s", &bind_name)) - ; + ; else if (unformat (a, "segment-size %dM", &tmp)) - utm->configured_segment_size = tmp<<20; + utm->configured_segment_size = tmp << 20; else if (unformat (a, "segment-size %dG", &tmp)) - utm->configured_segment_size = tmp<<30; + utm->configured_segment_size = tmp << 30; else if (unformat (a, "master")) - i_am_master = 1; + i_am_master = 1; else if (unformat (a, "slave")) - i_am_master = 0; + i_am_master = 0; else if (unformat (a, "drop")) - drop_packets = 1; + drop_packets = 1; + else if (unformat (a, "test")) + test_return_packets = 1; else - { - fformat (stderr, "%s: usage [master|slave]\n"); - exit (1); - } + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } } utm->uri = format (0, "%s%c", bind_name, 0); utm->i_am_master = i_am_master; utm->segment_main = &svm_fifo_segment_main; utm->drop_packets = drop_packets; - + utm->test_return_packets = test_return_packets; utm->connect_uri = format (0, 
"tcp://6.0.1.2/1234%c", 0); - setup_signal_handlers(); + setup_signal_handlers (); uri_api_hookup (utm); - if (connect_to_vpp (i_am_master? "uri_tcp_server":"uri_tcp_client") < 0) + if (connect_to_vpp (i_am_master ? "uri_tcp_server" : "uri_tcp_client") < 0) { svm_region_exit (); fformat (stderr, "Couldn't connect to vpe, exiting...\n"); @@ -896,7 +950,7 @@ main (int argc, char **argv) if (i_am_master == 0) { - uri_tcp_client_test (utm); + client_test (utm); exit (0); } @@ -909,8 +963,16 @@ main (int argc, char **argv) for (i = 0; i < 200000; i++) pool_put_index (utm->sessions, i); - uri_tcp_server_test (utm); + server_test (utm); vl_client_disconnect_from_vlib (); exit (0); } + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 6f5284c9..54625d64 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -26,25 +26,25 @@ #include #include #include -#include -#include - -#include +#include +#include +#include +#include #define vl_typedefs /* define message structures */ -#include +#include #undef vl_typedefs /* declare message handlers for each api */ #define vl_endianfun /* define message structures */ -#include +#include #undef vl_endianfun /* instantiate all the print functions we know about */ #define vl_print(handle, ...) 
#define vl_printfun -#include +#include #undef vl_printfun /* Satisfy external references when not linking with -lvlib */ @@ -87,12 +87,28 @@ typedef struct /* intermediate rx buffer */ u8 *rx_buf; + /* URI for connect */ + u8 *connect_uri; + + int i_am_master; + /* Our event queue */ unix_shared_memory_queue_t *our_event_queue; /* $$$ single thread only for the moment */ unix_shared_memory_queue_t *vpp_event_queue; + /* $$$$ hack: cut-through session index */ + volatile u32 cut_through_session_index; + + /* unique segment name counter */ + u32 unique_segment_index; + + pid_t my_pid; + + /* pthread handle */ + pthread_t cut_through_thread_handle; + /* For deadman timers */ clib_time_t clib_time; @@ -102,14 +118,20 @@ typedef struct volatile int time_to_stop; volatile int time_to_print_stats; + u32 configured_segment_size; + /* VNET_API_ERROR_FOO -> "Foo" hash table */ uword *error_string_by_error_number; + + /* convenience */ + svm_fifo_segment_main_t *segment_main; + } uri_udp_test_main_t; #if CLIB_DEBUG > 0 -#define NITER 1000 +#define NITER 10000 #else -#define NITER 1000000 +#define NITER 4000000 #endif uri_udp_test_main_t uri_udp_test_main; @@ -159,7 +181,13 @@ format_api_error (u8 * s, va_list * args) int wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) { - f64 timeout = clib_time_now (&utm->clib_time) + 5.0; +#if CLIB_DEBUG > 0 +#define TIMEOUT 600.0 +#else +#define TIMEOUT 600.0 +#endif + + f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; while (clib_time_now (&utm->clib_time) < timeout) { @@ -169,6 +197,183 @@ wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) return -1; } +u64 server_bytes_received, server_bytes_sent; + +static void * +cut_through_thread_fn (void *arg) +{ + session_t *s; + svm_fifo_t *rx_fifo; + svm_fifo_t *tx_fifo; + u8 *my_copy_buffer = 0; + uri_udp_test_main_t *utm = &uri_udp_test_main; + i32 actual_transfer; + int rv; + u32 buffer_offset; + + while 
(utm->cut_through_session_index == ~0) + ; + + s = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + + rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; + + vec_validate (my_copy_buffer, 64 * 1024 - 1); + + while (true) + { + /* We read from the tx fifo and write to the rx fifo */ + do + { + actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, 0, + vec_len (my_copy_buffer), + my_copy_buffer); + } + while (actual_transfer <= 0); + + server_bytes_received += actual_transfer; + + buffer_offset = 0; + while (actual_transfer > 0) + { + rv = svm_fifo_enqueue_nowait (rx_fifo, 0, actual_transfer, + my_copy_buffer + buffer_offset); + if (rv > 0) + { + actual_transfer -= rv; + buffer_offset += rv; + server_bytes_sent += rv; + } + + } + if (PREDICT_FALSE (utm->time_to_stop)) + break; + } + + pthread_exit (0); +} + +static void +uri_udp_slave_test (uri_udp_test_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + int i; + u8 *test_data = 0; + u64 bytes_received = 0, bytes_sent = 0; + i32 bytes_to_read; + int rv; + int mypid = getpid (); + f64 before, after, delta, bytes_per_second; + session_t *session; + svm_fifo_t *rx_fifo, *tx_fifo; + int buffer_offset, bytes_to_send = 0; + + vec_validate (test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i & 0xff; + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + rx_fifo = session->server_rx_fifo; + tx_fifo = session->server_tx_fifo; + + before = clib_time_now (&utm->clib_time); + + vec_validate 
(utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < NITER; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, + bytes_to_send, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + } + } + + bytes_to_read = svm_fifo_max_dequeue (rx_fifo); + + bytes_to_read = vec_len (utm->rx_buf) > bytes_to_read ? + bytes_to_read : vec_len (utm->rx_buf); + + buffer_offset = 0; + while (bytes_to_read > 0) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + bytes_to_read, + utm->rx_buf + buffer_offset); + if (rv > 0) + { + bytes_to_read -= rv; + buffer_offset += rv; + bytes_received += rv; + } + } + } + while (bytes_received < bytes_sent) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + vec_len (utm->rx_buf), utm->rx_buf); + if (rv > 0) + { +#if CLIB_DEBUG > 0 + int j; + for (j = 0; j < rv; j++) + { + if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + bytes_received + j, + utm->rx_buf[j], + ((bytes_received + j) & 0xff)); + } + } +#endif + bytes_received += (u64) rv; + } + } + + after = clib_time_now (&utm->clib_time); + delta = after - before; + bytes_per_second = 0.0; + + if (delta > 0.0) + bytes_per_second = (f64) bytes_received / delta; + + fformat (stdout, + "Done: %lld recv bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_received, delta, bytes_per_second); + fformat (stdout, + "Done: %lld sent bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_sent, delta, bytes_per_second); + fformat (stdout, + "client -> server -> client round trip: %.2f Gbit/sec \n\n", + (bytes_per_second * 8.0) / 1e9); +} + static void vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) { @@ -183,12 +388,16 @@ vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) } a->segment_name = (char *) mp->segment_name; + a->segment_size = 
mp->segment_size; + + ASSERT (mp->server_event_queue_address); /* Attach to the segment vpp created */ rv = svm_fifo_segment_attach (a); if (rv) { - clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); return; } @@ -198,6 +407,101 @@ vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) utm->state = STATE_READY; } +static void +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, + mp->segment_size); +} + +static void +vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) +{ + u32 segment_index; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *seg; + unix_shared_memory_queue_t *client_q; + vl_api_connect_uri_reply_t *rmp; + session_t *session; + int rv = 0; + + /* Create the segment */ + a->segment_name = (char *) format (0, "%d:segment%d%c", utm->my_pid, + utm->unique_segment_index++, 0); + a->segment_size = utm->configured_segment_size; + + rv = svm_fifo_segment_create (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", a->segment_name); + rv = VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + goto send_reply; + } + + vec_add2 (utm->seg, seg, 1); + + segment_index = vec_len (sm->segments) - 1; + + memcpy (seg, sm->segments + segment_index, sizeof (utm->seg[0])); + + pool_get (utm->sessions, session); + + /* + * By construction the master's idea of the rx fifo ends up in + * fsh->fifos[0], and the 
master's idea of the tx fifo ends up in + * fsh->fifos[1]. + */ + session->server_rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_rx_fifo); + + session->server_tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_tx_fifo); + + session->server_rx_fifo->server_session_index = session - utm->sessions; + session->server_tx_fifo->server_session_index = session - utm->sessions; + utm->cut_through_session_index = session - utm->sessions; + + rv = pthread_create (&utm->cut_through_thread_handle, + NULL /*attr */ , cut_through_thread_fn, 0); + if (rv) + { + clib_warning ("pthread_create returned %d", rv); + rv = VNET_API_ERROR_SYSCALL_ERROR_1; + } + +send_reply: + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + + rmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (rv); + rmp->segment_name_length = vec_len (a->segment_name); + memcpy (rmp->segment_name, a->segment_name, vec_len (a->segment_name)); + + vec_free (a->segment_name); + + client_q = (unix_shared_memory_queue_t *) mp->client_queue_address; + vl_msg_api_send_shmem (client_q, (u8 *) & rmp); +} + static void vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) { @@ -293,18 +597,79 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *seg; + svm_fifo_segment_header_t *fsh; + session_t *session; + u32 segment_index; + int rv; + + ASSERT (utm->i_am_master == 0); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + memset (a, 0, 
sizeof (*a)); + + a->segment_name = (char *) mp->segment_name; + + sleep (1); + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%v') failed", mp->segment_name); + return; + } + + segment_index = vec_len (sm->segments) - 1; + + vec_add2 (utm->seg, seg, 1); + + memcpy (seg, sm->segments + segment_index, sizeof (*seg)); + sh = seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + while (vec_len (fsh->fifos) < 2) + sleep (1); + + pool_get (utm->sessions, session); + utm->cut_through_session_index = session - utm->sessions; + + session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (session->server_rx_fifo); + session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (session->server_tx_fifo); + + /* security: could unlink /dev/shm/segment_name> here, maybe */ + + utm->state = STATE_READY; +} + #define foreach_uri_msg \ _(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI, connect_uri) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ _(UNBIND_URI_REPLY, unbind_uri_reply) \ _(ACCEPT_SESSION, accept_session) \ -_(DISCONNECT_SESSION, disconnect_session) +_(DISCONNECT_SESSION, disconnect_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) void uri_api_hookup (uri_udp_test_main_t * utm) { #define _(N,n) \ vl_msg_api_set_handlers(VL_API_##N, #n, \ - vl_api_##n##_t_handler, \ + vl_api_##n##_t_handler, \ vl_noop_handler, \ vl_api_##n##_t_endian, \ vl_api_##n##_t_print, \ @@ -349,7 +714,7 @@ init_error_string_table (uri_udp_test_main_t * utm) } void -handle_fifo_event_server_rx (uri_udp_test_main_t * utm, +server_handle_fifo_event_rx (uri_udp_test_main_t * utm, session_fifo_event_t * e) { svm_fifo_t *rx_fifo, *tx_fifo; @@ -385,7 +750,7 @@ handle_fifo_event_server_rx (uri_udp_test_main_t * utm, } void -handle_event_queue (uri_udp_test_main_t * utm) +server_handle_event_queue (uri_udp_test_main_t * utm) { session_fifo_event_t _e, *e = &_e;; @@ -396,7 +761,7 @@ handle_event_queue (uri_udp_test_main_t * utm) 
switch (e->event_type) { case FIFO_EVENT_SERVER_RX: - handle_fifo_event_server_rx (utm, e); + server_handle_fifo_event_rx (utm, e); break; case FIFO_EVENT_SERVER_EXIT: @@ -428,7 +793,12 @@ uri_udp_test (uri_udp_test_main_t * utm) bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); bmp->client_index = utm->my_client_index; bmp->context = ntohl (0xfeedface); - bmp->segment_size = 2 << 30; + bmp->initial_segment_size = 256 << 20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); @@ -438,7 +808,7 @@ uri_udp_test (uri_udp_test_main_t * utm) return; } - handle_event_queue (utm); + server_handle_event_queue (utm); ump = vl_msg_api_alloc (sizeof (*ump)); memset (ump, 0, sizeof (*ump)); @@ -464,10 +834,12 @@ main (int argc, char **argv) unformat_input_t _argv, *a = &_argv; u8 *chroot_prefix; u8 *heap; - u8 *bind_name = (u8 *) "udp4:1234"; + u8 *bind_name = (u8 *) "udp://0.0.0.0/1234"; + u32 tmp; mheap_t *h; session_t *session; int i; + int i_am_master = 1; clib_mem_init (0, 256 << 20); @@ -481,6 +853,9 @@ main (int argc, char **argv) utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + utm->my_pid = getpid (); + utm->configured_segment_size = 1 << 20; + clib_time_init (&utm->clib_time); init_error_string_table (utm); svm_fifo_segment_init (0x200000000ULL, 20); @@ -494,6 +869,14 @@ main (int argc, char **argv) } else if (unformat (a, "uri %s", &bind_name)) ; + else if (unformat (a, "segment-size %dM", &tmp)) + utm->configured_segment_size = tmp << 20; + else if (unformat (a, "segment-size %dG", &tmp)) + utm->configured_segment_size = tmp << 30; + else if (unformat (a, "master")) + i_am_master = 1; + 
else if (unformat (a, "slave")) + i_am_master = 0; else { fformat (stderr, "%s: usage [master|slave]\n"); @@ -501,19 +884,30 @@ main (int argc, char **argv) } } + utm->cut_through_session_index = ~0; utm->uri = format (0, "%s%c", bind_name, 0); + utm->i_am_master = i_am_master; + utm->segment_main = &svm_fifo_segment_main; + + utm->connect_uri = format (0, "udp://10.0.0.1/1234%c", 0); setup_signal_handlers (); uri_api_hookup (utm); - if (connect_to_vpp ("uri_udp_test") < 0) + if (connect_to_vpp (i_am_master ? "uri_udp_master" : "uri_udp_slave") < 0) { svm_region_exit (); fformat (stderr, "Couldn't connect to vpe, exiting...\n"); exit (1); } + if (i_am_master == 0) + { + uri_udp_slave_test (utm); + exit (0); + } + /* $$$$ hack preallocation */ for (i = 0; i < 200000; i++) { @@ -531,7 +925,7 @@ main (int argc, char **argv) #undef vl_api_version #define vl_api_version(n,v) static u32 vpe_api_version = v; -#include +#include #undef vl_api_version void @@ -544,6 +938,12 @@ vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); } +u32 +vl (void *p) +{ + return vec_len (p); +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/uri/uri_udp_test2.c b/src/uri/uri_udp_test2.c deleted file mode 100644 index ddfffaa6..00000000 --- a/src/uri/uri_udp_test2.c +++ /dev/null @@ -1,954 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../vnet/session/application_interface.h" - -#define vl_typedefs /* define message structures */ -#include -#undef vl_typedefs - -/* declare message handlers for each api */ - -#define vl_endianfun /* define message structures */ -#include -#undef vl_endianfun - -/* instantiate all the print functions we know about */ -#define vl_print(handle, ...) -#define vl_printfun -#include -#undef vl_printfun - -/* Satisfy external references when not linking with -lvlib */ -vlib_main_t vlib_global_main; -vlib_main_t **vlib_mains; - -typedef enum -{ - STATE_START, - STATE_READY, - STATE_DISCONNECTING, -} connection_state_t; - -typedef struct -{ - svm_fifo_t *server_rx_fifo; - svm_fifo_t *server_tx_fifo; -} session_t; - -typedef struct -{ - /* vpe input queue */ - unix_shared_memory_queue_t *vl_input_queue; - - /* API client handle */ - u32 my_client_index; - - /* The URI we're playing with */ - u8 *uri; - - /* Session pool */ - session_t *sessions; - - /* Hash table for disconnect processing */ - uword *session_index_by_vpp_handles; - - /* fifo segment */ - svm_fifo_segment_private_t *seg; - - /* intermediate rx buffer */ - u8 *rx_buf; - - /* URI for connect */ - u8 *connect_uri; - - int i_am_master; - - /* Our event queue */ - unix_shared_memory_queue_t *our_event_queue; - - /* $$$ single thread only for the moment */ - unix_shared_memory_queue_t *vpp_event_queue; - - /* $$$$ hack: cut-through session index */ - volatile u32 cut_through_session_index; - - /* unique segment name counter */ - u32 unique_segment_index; - - pid_t my_pid; - - /* pthread handle */ - pthread_t cut_through_thread_handle; - - /* For deadman timers */ - clib_time_t clib_time; - - /* State of the connection, shared between msg RX thread and main thread */ - volatile connection_state_t state; - - volatile int time_to_stop; - 
volatile int time_to_print_stats; - - u32 configured_segment_size; - - /* VNET_API_ERROR_FOO -> "Foo" hash table */ - uword *error_string_by_error_number; - - /* convenience */ - svm_fifo_segment_main_t *segment_main; - -} uri_udp_test_main_t; - -#if CLIB_DEBUG > 0 -#define NITER 10000 -#else -#define NITER 4000000 -#endif - -uri_udp_test_main_t uri_udp_test_main; - -static void -stop_signal (int signum) -{ - uri_udp_test_main_t *um = &uri_udp_test_main; - - um->time_to_stop = 1; -} - -static void -stats_signal (int signum) -{ - uri_udp_test_main_t *um = &uri_udp_test_main; - - um->time_to_print_stats = 1; -} - -static clib_error_t * -setup_signal_handlers (void) -{ - signal (SIGINT, stats_signal); - signal (SIGQUIT, stop_signal); - signal (SIGTERM, stop_signal); - - return 0; -} - -u8 * -format_api_error (u8 * s, va_list * args) -{ - uri_udp_test_main_t *utm = va_arg (*args, uri_udp_test_main_t *); - i32 error = va_arg (*args, u32); - uword *p; - - p = hash_get (utm->error_string_by_error_number, -error); - - if (p) - s = format (s, "%s", p[0]); - else - s = format (s, "%d", error); - return s; -} - -int -wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) -{ -#if CLIB_DEBUG > 0 -#define TIMEOUT 600.0 -#else -#define TIMEOUT 600.0 -#endif - - f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; - - while (clib_time_now (&utm->clib_time) < timeout) - { - if (utm->state == state) - return 0; - } - return -1; -} - -u64 server_bytes_received, server_bytes_sent; - -static void * -cut_through_thread_fn (void *arg) -{ - session_t *s; - svm_fifo_t *rx_fifo; - svm_fifo_t *tx_fifo; - u8 *my_copy_buffer = 0; - uri_udp_test_main_t *utm = &uri_udp_test_main; - i32 actual_transfer; - int rv; - u32 buffer_offset; - - while (utm->cut_through_session_index == ~0) - ; - - s = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); - - rx_fifo = s->server_rx_fifo; - tx_fifo = s->server_tx_fifo; - - vec_validate (my_copy_buffer, 64 * 1024 - 1); 
- - while (true) - { - /* We read from the tx fifo and write to the rx fifo */ - do - { - actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, 0, - vec_len (my_copy_buffer), - my_copy_buffer); - } - while (actual_transfer <= 0); - - server_bytes_received += actual_transfer; - - buffer_offset = 0; - while (actual_transfer > 0) - { - rv = svm_fifo_enqueue_nowait (rx_fifo, 0, actual_transfer, - my_copy_buffer + buffer_offset); - if (rv > 0) - { - actual_transfer -= rv; - buffer_offset += rv; - server_bytes_sent += rv; - } - - } - if (PREDICT_FALSE (utm->time_to_stop)) - break; - } - - pthread_exit (0); -} - -static void -uri_udp_slave_test (uri_udp_test_main_t * utm) -{ - vl_api_connect_uri_t *cmp; - int i; - u8 *test_data = 0; - u64 bytes_received = 0, bytes_sent = 0; - i32 bytes_to_read; - int rv; - int mypid = getpid (); - f64 before, after, delta, bytes_per_second; - session_t *session; - svm_fifo_t *rx_fifo, *tx_fifo; - int buffer_offset, bytes_to_send = 0; - - vec_validate (test_data, 64 * 1024 - 1); - for (i = 0; i < vec_len (test_data); i++) - test_data[i] = i & 0xff; - - cmp = vl_msg_api_alloc (sizeof (*cmp)); - memset (cmp, 0, sizeof (*cmp)); - - cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = utm->my_client_index; - cmp->context = ntohl (0xfeedface); - memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); - rx_fifo = session->server_rx_fifo; - tx_fifo = session->server_tx_fifo; - - before = clib_time_now (&utm->clib_time); - - vec_validate (utm->rx_buf, vec_len (test_data) - 1); - - for (i = 0; i < NITER; i++) - { - bytes_to_send = vec_len (test_data); - buffer_offset = 0; - while (bytes_to_send > 0) - { - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, - bytes_to_send, - 
test_data + buffer_offset); - - if (rv > 0) - { - bytes_to_send -= rv; - buffer_offset += rv; - bytes_sent += rv; - } - } - - bytes_to_read = svm_fifo_max_dequeue (rx_fifo); - - bytes_to_read = vec_len (utm->rx_buf) > bytes_to_read ? - bytes_to_read : vec_len (utm->rx_buf); - - buffer_offset = 0; - while (bytes_to_read > 0) - { - rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, - bytes_to_read, - utm->rx_buf + buffer_offset); - if (rv > 0) - { - bytes_to_read -= rv; - buffer_offset += rv; - bytes_received += rv; - } - } - } - while (bytes_received < bytes_sent) - { - rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, - vec_len (utm->rx_buf), utm->rx_buf); - if (rv > 0) - { -#if CLIB_DEBUG > 0 - int j; - for (j = 0; j < rv; j++) - { - if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) - { - clib_warning ("error at byte %lld, 0x%x not 0x%x", - bytes_received + j, - utm->rx_buf[j], - ((bytes_received + j) & 0xff)); - } - } -#endif - bytes_received += (u64) rv; - } - } - - after = clib_time_now (&utm->clib_time); - delta = after - before; - bytes_per_second = 0.0; - - if (delta > 0.0) - bytes_per_second = (f64) bytes_received / delta; - - fformat (stdout, - "Done: %lld recv bytes in %.2f seconds, %.2f bytes/sec...\n\n", - bytes_received, delta, bytes_per_second); - fformat (stdout, - "Done: %lld sent bytes in %.2f seconds, %.2f bytes/sec...\n\n", - bytes_sent, delta, bytes_per_second); - fformat (stdout, - "client -> server -> client round trip: %.2f Gbit/sec \n\n", - (bytes_per_second * 8.0) / 1e9); -} - -static void -vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) -{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; - - if (mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); - return; - } - - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - - ASSERT (mp->server_event_queue_address); - - /* Attach to the segment vpp created */ - rv = 
svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); - return; - } - - utm->our_event_queue = (unix_shared_memory_queue_t *) - mp->server_event_queue_address; - - utm->state = STATE_READY; -} - -static void -vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) -{ - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; - - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - /* Attach to the segment vpp created */ - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); - return; - } - clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, - mp->segment_size); -} - -static void -vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) -{ - u32 segment_index; - uri_udp_test_main_t *utm = &uri_udp_test_main; - svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - svm_fifo_segment_private_t *seg; - unix_shared_memory_queue_t *client_q; - vl_api_connect_uri_reply_t *rmp; - session_t *session; - int rv = 0; - - /* Create the segment */ - a->segment_name = (char *) format (0, "%d:segment%d%c", utm->my_pid, - utm->unique_segment_index++, 0); - a->segment_size = utm->configured_segment_size; - - rv = svm_fifo_segment_create (a); - if (rv) - { - clib_warning ("sm_fifo_segment_create ('%s') failed", a->segment_name); - rv = VNET_API_ERROR_URI_FIFO_CREATE_FAILED; - goto send_reply; - } - - vec_add2 (utm->seg, seg, 1); - - segment_index = vec_len (sm->segments) - 1; - - memcpy (seg, sm->segments + segment_index, sizeof (utm->seg[0])); - - pool_get (utm->sessions, session); - - /* - * By construction the master's idea of the rx fifo ends up in - * fsh->fifos[0], and the master's idea of the tx fifo ends up in - * fsh->fifos[1]. 
- */ - session->server_rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, - 128 * 1024); - ASSERT (session->server_rx_fifo); - - session->server_tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, - 128 * 1024); - ASSERT (session->server_tx_fifo); - - session->server_rx_fifo->server_session_index = session - utm->sessions; - session->server_tx_fifo->server_session_index = session - utm->sessions; - utm->cut_through_session_index = session - utm->sessions; - - rv = pthread_create (&utm->cut_through_thread_handle, - NULL /*attr */ , cut_through_thread_fn, 0); - if (rv) - { - clib_warning ("pthread_create returned %d", rv); - rv = VNET_API_ERROR_SYSCALL_ERROR_1; - } - -send_reply: - rmp = vl_msg_api_alloc (sizeof (*rmp)); - memset (rmp, 0, sizeof (*rmp)); - - rmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI_REPLY); - rmp->context = mp->context; - rmp->retval = ntohl (rv); - rmp->segment_name_length = vec_len (a->segment_name); - memcpy (rmp->segment_name, a->segment_name, vec_len (a->segment_name)); - - vec_free (a->segment_name); - - client_q = (unix_shared_memory_queue_t *) mp->client_queue_address; - vl_msg_api_send_shmem (client_q, (u8 *) & rmp); -} - -static void -vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) -{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - - if (mp->retval != 0) - clib_warning ("returned %d", ntohl (mp->retval)); - - utm->state = STATE_START; -} - -static void -vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) -{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - vl_api_accept_session_reply_t *rmp; - svm_fifo_t *rx_fifo, *tx_fifo; - session_t *session; - static f64 start_time; - u64 key; - - if (start_time == 0.0) - start_time = clib_time_now (&utm->clib_time); - - utm->vpp_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; - - pool_get (utm->sessions, session); - - rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; - rx_fifo->client_session_index = session - utm->sessions; - tx_fifo = (svm_fifo_t 
*) mp->server_tx_fifo; - tx_fifo->client_session_index = session - utm->sessions; - - session->server_rx_fifo = rx_fifo; - session->server_tx_fifo = tx_fifo; - - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - - hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); - - utm->state = STATE_READY; - - if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) - { - f64 now = clib_time_now (&utm->clib_time); - fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", - pool_elts (utm->sessions), now - start_time, - (f64) pool_elts (utm->sessions) / (now - start_time)); - } - - rmp = vl_msg_api_alloc (sizeof (*rmp)); - memset (rmp, 0, sizeof (*rmp)); - rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); - rmp->session_type = mp->session_type; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); -} - -static void -vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) -{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - session_t *session; - vl_api_disconnect_session_reply_t *rmp; - uword *p; - int rv = 0; - u64 key; - - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - - p = hash_get (utm->session_index_by_vpp_handles, key); - - if (p) - { - session = pool_elt_at_index (utm->sessions, p[0]); - hash_unset (utm->session_index_by_vpp_handles, key); - pool_put (utm->sessions, session); - } - else - { - clib_warning ("couldn't find session key %llx", key); - rv = -11; - } - - rmp = vl_msg_api_alloc (sizeof (*rmp)); - memset (rmp, 0, sizeof (*rmp)); - rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); - rmp->retval = rv; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); -} - -static void -vl_api_connect_uri_reply_t_handler 
(vl_api_connect_uri_reply_t * mp) -{ - svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; - uri_udp_test_main_t *utm = &uri_udp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - ssvm_shared_header_t *sh; - svm_fifo_segment_private_t *seg; - svm_fifo_segment_header_t *fsh; - session_t *session; - u32 segment_index; - int rv; - - ASSERT (utm->i_am_master == 0); - - if (mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); - return; - } - - memset (a, 0, sizeof (*a)); - - a->segment_name = (char *) mp->segment_name; - - sleep (1); - - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("sm_fifo_segment_create ('%v') failed", mp->segment_name); - return; - } - - segment_index = vec_len (sm->segments) - 1; - - vec_add2 (utm->seg, seg, 1); - - memcpy (seg, sm->segments + segment_index, sizeof (*seg)); - sh = seg->ssvm.sh; - fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - - while (vec_len (fsh->fifos) < 2) - sleep (1); - - pool_get (utm->sessions, session); - utm->cut_through_session_index = session - utm->sessions; - - session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; - ASSERT (session->server_rx_fifo); - session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; - ASSERT (session->server_tx_fifo); - - /* security: could unlink /dev/shm/segment_name> here, maybe */ - - utm->state = STATE_READY; -} - -#define foreach_uri_msg \ -_(BIND_URI_REPLY, bind_uri_reply) \ -_(CONNECT_URI, connect_uri) \ -_(CONNECT_URI_REPLY, connect_uri_reply) \ -_(UNBIND_URI_REPLY, unbind_uri_reply) \ -_(ACCEPT_SESSION, accept_session) \ -_(DISCONNECT_SESSION, disconnect_session) \ -_(MAP_ANOTHER_SEGMENT, map_another_segment) - -void -uri_api_hookup (uri_udp_test_main_t * utm) -{ -#define _(N,n) \ - vl_msg_api_set_handlers(VL_API_##N, #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - foreach_uri_msg; -#undef _ - -} - - -int -connect_to_vpp (char *name) 
-{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - api_main_t *am = &api_main; - - if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) - return -1; - - utm->vl_input_queue = am->shmem_hdr->vl_input_queue; - utm->my_client_index = am->my_client_index; - - return 0; -} - -void -vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) -{ - clib_warning ("BUG"); -} - -static void -init_error_string_table (uri_udp_test_main_t * utm) -{ - utm->error_string_by_error_number = hash_create (0, sizeof (uword)); - -#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); - foreach_vnet_api_error; -#undef _ - - hash_set (utm->error_string_by_error_number, 99, "Misc"); -} - -void -handle_fifo_event_server_rx (uri_udp_test_main_t * utm, - session_fifo_event_t * e) -{ - svm_fifo_t *rx_fifo, *tx_fifo; - int nbytes; - - session_fifo_event_t evt; - unix_shared_memory_queue_t *q; - int rv; - - rx_fifo = e->fifo; - tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; - - do - { - nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, - vec_len (utm->rx_buf), utm->rx_buf); - } - while (nbytes <= 0); - do - { - rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); - } - while (rv == -2); - - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = nbytes; - evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); -} - -void -handle_event_queue (uri_udp_test_main_t * utm) -{ - session_fifo_event_t _e, *e = &_e;; - - while (1) - { - unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, - 0 /* nowait */ ); - switch (e->event_type) - { - case FIFO_EVENT_SERVER_RX: - handle_fifo_event_server_rx (utm, e); - break; - - case FIFO_EVENT_SERVER_EXIT: - return; - - default: - clib_warning ("unknown event type %d", e->event_type); - break; - } - if (PREDICT_FALSE 
(utm->time_to_stop == 1)) - break; - if (PREDICT_FALSE (utm->time_to_print_stats == 1)) - { - utm->time_to_print_stats = 0; - fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); - } - } -} - -void -uri_udp_test (uri_udp_test_main_t * utm) -{ - vl_api_bind_uri_t *bmp; - vl_api_unbind_uri_t *ump; - - bmp = vl_msg_api_alloc (sizeof (*bmp)); - memset (bmp, 0, sizeof (*bmp)); - - bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); - bmp->client_index = utm->my_client_index; - bmp->context = ntohl (0xfeedface); - bmp->initial_segment_size = 256 << 20; /* size of initial segment */ - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; - bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 16 << 10; - bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 16 << 10; - bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; - memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - handle_event_queue (utm); - - ump = vl_msg_api_alloc (sizeof (*ump)); - memset (ump, 0, sizeof (*ump)); - - ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); - ump->client_index = utm->my_client_index; - memcpy (ump->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); - - if (wait_for_state_change (utm, STATE_START)) - { - clib_warning ("timeout waiting for STATE_START"); - return; - } - - fformat (stdout, "Test complete...\n"); -} - -int -main (int argc, char **argv) -{ - uri_udp_test_main_t *utm = &uri_udp_test_main; - unformat_input_t _argv, *a = &_argv; - u8 *chroot_prefix; - u8 *heap; - u8 *bind_name = (u8 *) "udp://0.0.0.0/1234"; - u32 tmp; - mheap_t *h; - session_t *session; - int i; - int i_am_master = 1; - - clib_mem_init (0, 256 << 20); - - heap = clib_mem_get_per_cpu_heap (); - h = mheap_header (heap); - - /* make the main heap 
thread-safe */ - h->flags |= MHEAP_FLAG_THREAD_SAFE; - - vec_validate (utm->rx_buf, 8192); - - utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); - - utm->my_pid = getpid (); - utm->configured_segment_size = 1 << 20; - - clib_time_init (&utm->clib_time); - init_error_string_table (utm); - svm_fifo_segment_init (0x200000000ULL, 20); - unformat_init_command_line (a, argv); - - while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) - { - if (unformat (a, "chroot prefix %s", &chroot_prefix)) - { - vl_set_memory_root_path ((char *) chroot_prefix); - } - else if (unformat (a, "uri %s", &bind_name)) - ; - else if (unformat (a, "segment-size %dM", &tmp)) - utm->configured_segment_size = tmp << 20; - else if (unformat (a, "segment-size %dG", &tmp)) - utm->configured_segment_size = tmp << 30; - else if (unformat (a, "master")) - i_am_master = 1; - else if (unformat (a, "slave")) - i_am_master = 0; - else - { - fformat (stderr, "%s: usage [master|slave]\n"); - exit (1); - } - } - - utm->cut_through_session_index = ~0; - utm->uri = format (0, "%s%c", bind_name, 0); - utm->i_am_master = i_am_master; - utm->segment_main = &svm_fifo_segment_main; - - utm->connect_uri = format (0, "udp://10.0.0.1/1234%c", 0); - - setup_signal_handlers (); - - uri_api_hookup (utm); - - if (connect_to_vpp (i_am_master ? 
"uri_udp_master" : "uri_udp_slave") < 0) - { - svm_region_exit (); - fformat (stderr, "Couldn't connect to vpe, exiting...\n"); - exit (1); - } - - if (i_am_master == 0) - { - uri_udp_slave_test (utm); - exit (0); - } - - /* $$$$ hack preallocation */ - for (i = 0; i < 200000; i++) - { - pool_get (utm->sessions, session); - memset (session, 0, sizeof (*session)); - } - for (i = 0; i < 200000; i++) - pool_put_index (utm->sessions, i); - - uri_udp_test (utm); - - vl_client_disconnect_from_vlib (); - exit (0); -} - -#undef vl_api_version -#define vl_api_version(n,v) static u32 vpe_api_version = v; -#include -#undef vl_api_version - -void -vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) -{ - /* - * Send the main API signature in slot 0. This bit of code must - * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). - */ - mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); -} - -u32 -vl (void *p) -{ - return vec_len (p); -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/uri/uritest.c b/src/uri/uritest.c deleted file mode 100644 index edcdb3ad..00000000 --- a/src/uri/uritest.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define vl_typedefs /* define message structures */ -#include -#undef vl_typedefs - -/* declare message handlers for each api */ - -#define vl_endianfun /* define message structures */ -#include -#undef vl_endianfun - -/* instantiate all the print functions we know about */ -#define vl_print(handle, ...) -#define vl_printfun -#include -#undef vl_printfun - -typedef enum -{ - STATE_START, - STATE_READY, - STATE_DISCONNECTING, -} connection_state_t; - -typedef struct -{ - /* vpe input queue */ - unix_shared_memory_queue_t *vl_input_queue; - - /* API client handle */ - u32 my_client_index; - - /* role */ - int i_am_master; - - /* The URI we're playing with */ - u8 *uri; - - /* fifo segment */ - svm_fifo_segment_private_t *seg; - - svm_fifo_t *rx_fifo; - svm_fifo_t *tx_fifo; - - /* For deadman timers */ - clib_time_t clib_time; - - /* State of the connection, shared between msg RX thread and main thread */ - volatile connection_state_t state; - - /* VNET_API_ERROR_FOO -> "Foo" hash table */ - uword *error_string_by_error_number; -} uritest_main_t; - -#if CLIB_DEBUG > 0 -#define NITER 1000 -#else -#define NITER 1000000 -#endif - -uritest_main_t uritest_main; - -u8 * -format_api_error (u8 * s, va_list * args) -{ - uritest_main_t *utm = va_arg (*args, uritest_main_t *); - i32 error = va_arg (*args, u32); - uword *p; - - p = hash_get (utm->error_string_by_error_number, -error); - - if (p) - s = format (s, "%s", p[0]); - else - s = format (s, "%d", error); - return s; -} - -int -wait_for_state_change (uritest_main_t * utm, connection_state_t state) -{ - f64 timeout = clib_time_now (&utm->clib_time) + 1.0; - - while (clib_time_now (&utm->clib_time) < timeout) - { - if (utm->state == state) - return 0; - } - return -1; -} - -static void -vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) -{ - uritest_main_t *utm = 
&uritest_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; - - ASSERT (utm->i_am_master); - - if (mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); - return; - } - - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - - /* Create the segment */ - rv = svm_fifo_segment_create (a); - if (rv) - { - clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); - return; - } - - vec_validate (utm->seg, 0); - - memcpy (utm->seg, a->rv, sizeof (*utm->seg)); - - /* - * By construction the master's idea of the rx fifo ends up in - * fsh->fifos[0], and the master's idea of the tx fifo ends up in - * fsh->fifos[1]. - */ - utm->rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); - ASSERT (utm->rx_fifo); - - utm->tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); - ASSERT (utm->tx_fifo); - - utm->state = STATE_READY; -} - -static void -vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) -{ - uritest_main_t *utm = &uritest_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - ssvm_shared_header_t *sh; - svm_fifo_segment_header_t *fsh; - int rv; - - ASSERT (utm->i_am_master == 0); - - if (mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); - return; - } - - memset (a, 0, sizeof (*a)); - - a->segment_name = (char *) mp->segment_name; - - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); - return; - } - - vec_validate (utm->seg, 0); - - memcpy (utm->seg, a->rv, sizeof (*utm->seg)); - sh = utm->seg->ssvm.sh; - fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - - while (vec_len (fsh->fifos) < 2) - sleep (1); - - utm->rx_fifo = (svm_fifo_t *) fsh->fifos[1]; - ASSERT (utm->rx_fifo); - utm->tx_fifo = (svm_fifo_t *) fsh->fifos[0]; - ASSERT (utm->tx_fifo); - - /* security: could unlink /dev/shm/segment_name> here, maybe */ - - utm->state = STATE_READY; -} - -static void 
-vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) -{ - uritest_main_t *utm = &uritest_main; - - if (mp->retval != 0) - clib_warning ("returned %d", ntohl (mp->retval)); - - utm->state = STATE_START; -} - -#define foreach_uri_msg \ -_(BIND_URI_REPLY, bind_uri_reply) \ -_(CONNECT_URI_REPLY, connect_uri_reply) \ -_(UNBIND_URI_REPLY, unbind_uri_reply) - -void -uri_api_hookup (uritest_main_t * utm) -{ -#define _(N,n) \ - vl_msg_api_set_handlers(VL_API_##N, #n, \ - vl_api_##n##_t_handler, \ - vl_noop_handler, \ - vl_api_##n##_t_endian, \ - vl_api_##n##_t_print, \ - sizeof(vl_api_##n##_t), 1); - foreach_uri_msg; -#undef _ - -} - - -int -connect_to_vpp (char *name) -{ - uritest_main_t *utm = &uritest_main; - api_main_t *am = &api_main; - - if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) - return -1; - - utm->vl_input_queue = am->shmem_hdr->vl_input_queue; - utm->my_client_index = am->my_client_index; - - return 0; -} - -void -vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) 
-{ - clib_warning ("BUG"); -} - -static void -init_error_string_table (uritest_main_t * utm) -{ - utm->error_string_by_error_number = hash_create (0, sizeof (uword)); - -#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); - foreach_vnet_api_error; -#undef _ - - hash_set (utm->error_string_by_error_number, 99, "Misc"); -} - -void -uritest_master (uritest_main_t * utm) -{ - vl_api_bind_uri_t *bmp; - vl_api_unbind_uri_t *ump; - int i; - u8 *test_data = 0; - u8 *reply = 0; - u32 reply_len; - int mypid = getpid (); - - for (i = 0; i < 2048; i++) - vec_add1 (test_data, 'a' + (i % 32)); - - bmp = vl_msg_api_alloc (sizeof (*bmp)); - memset (bmp, 0, sizeof (*bmp)); - - bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); - bmp->client_index = utm->my_client_index; - bmp->context = ntohl (0xfeedface); - bmp->segment_size = 256 << 10; - memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - for (i = 0; i < NITER; i++) - svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (test_data), test_data); - - vec_validate (reply, 0); - - reply_len = svm_fifo_dequeue (utm->rx_fifo, mypid, vec_len (reply), reply); - - if (reply_len != 1) - clib_warning ("reply length %d", reply_len); - - if (reply[0] == 1) - fformat (stdout, "Test OK..."); - - ump = vl_msg_api_alloc (sizeof (*ump)); - memset (ump, 0, sizeof (*ump)); - - ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); - ump->client_index = utm->my_client_index; - memcpy (ump->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); - - if (wait_for_state_change (utm, STATE_START)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - fformat (stdout, "Master done...\n"); -} - -void -uritest_slave (uritest_main_t * utm) -{ - vl_api_connect_uri_t *cmp; - int i, j; - u8 *test_data = 0; - u8 *reply = 0; - 
u32 bytes_received = 0; - u32 actual_bytes; - int mypid = getpid (); - u8 ok; - f64 before, after, delta, bytes_per_second; - - vec_validate (test_data, 4095); - - cmp = vl_msg_api_alloc (sizeof (*cmp)); - memset (cmp, 0, sizeof (*cmp)); - - cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = utm->my_client_index; - cmp->context = ntohl (0xfeedface); - memcpy (cmp->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - ok = 1; - before = clib_time_now (&utm->clib_time); - for (i = 0; i < NITER; i++) - { - actual_bytes = svm_fifo_dequeue (utm->rx_fifo, mypid, - vec_len (test_data), test_data); - j = 0; - while (j < actual_bytes) - { - if (test_data[j] != ('a' + (bytes_received % 32))) - ok = 0; - bytes_received++; - j++; - } - if (bytes_received == NITER * 2048) - break; - } - - vec_add1 (reply, ok); - - svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (reply), reply); - after = clib_time_now (&utm->clib_time); - delta = after - before; - bytes_per_second = 0.0; - - if (delta > 0.0) - bytes_per_second = (f64) bytes_received / delta; - - fformat (stdout, - "Slave done, %d bytes in %.2f seconds, %.2f bytes/sec...\n", - bytes_received, delta, bytes_per_second); -} - -int -main (int argc, char **argv) -{ - uritest_main_t *utm = &uritest_main; - unformat_input_t _argv, *a = &_argv; - u8 *chroot_prefix; - u8 *heap; - char *bind_name = "fifo:uritest"; - mheap_t *h; - int i_am_master = 0; - - clib_mem_init (0, 128 << 20); - - heap = clib_mem_get_per_cpu_heap (); - h = mheap_header (heap); - - /* make the main heap thread-safe */ - h->flags |= MHEAP_FLAG_THREAD_SAFE; - - clib_time_init (&utm->clib_time); - init_error_string_table (utm); - svm_fifo_segment_init (0x200000000ULL, 20); - unformat_init_command_line (a, argv); - - utm->uri = format (0, "%s%c", bind_name, 0); - - while 
(unformat_check_input (a) != UNFORMAT_END_OF_INPUT) - { - if (unformat (a, "master")) - i_am_master = 1; - else if (unformat (a, "slave")) - i_am_master = 0; - else if (unformat (a, "chroot prefix %s", &chroot_prefix)) - { - vl_set_memory_root_path ((char *) chroot_prefix); - } - else - { - fformat (stderr, "%s: usage [master|slave]\n"); - exit (1); - } - } - - uri_api_hookup (utm); - - if (connect_to_vpp (i_am_master ? "uritest_master" : "uritest_slave") < 0) - { - svm_region_exit (); - fformat (stderr, "Couldn't connect to vpe, exiting...\n"); - exit (1); - } - - utm->i_am_master = i_am_master; - - if (i_am_master) - uritest_master (utm); - else - uritest_slave (utm); - - vl_client_disconnect_from_vlib (); - exit (0); -} - -#undef vl_api_version -#define vl_api_version(n,v) static u32 vpe_api_version = v; -#include -#undef vl_api_version - -void -vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) -{ - /* - * Send the main API signature in slot 0. This bit of code must - * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). 
- */ - mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet.am b/src/vnet.am index 7125a122..4e30ee92 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -461,6 +461,7 @@ libvnet_la_SOURCES += \ vnet/tcp/tcp_output.c \ vnet/tcp/tcp_input.c \ vnet/tcp/tcp_newreno.c \ + vnet/tcp/builtin_server.c \ vnet/tcp/tcp.c nobase_include_HEADERS += \ diff --git a/src/vnet/api_errno.h b/src/vnet/api_errno.h index 5e65ac7b..74d39bdb 100644 --- a/src/vnet/api_errno.h +++ b/src/vnet/api_errno.h @@ -103,7 +103,8 @@ _(LISP_RLOC_LOCAL, -110, "RLOC address is local") \ _(BFD_EAGAIN, -111, "BFD object cannot be manipulated at this time") \ _(INVALID_GPE_MODE, -112, "Invalid GPE mode") \ _(LISP_GPE_ENTRIES_PRESENT, -113, "LISP GPE entries are present") \ -_(ADDRESS_FOUND_FOR_INTERFACE, -114, "Address found for interface") +_(ADDRESS_FOUND_FOR_INTERFACE, -114, "Address found for interface") \ +_(SESSION_CONNECT_FAIL, -115, "Session failed to connect") typedef enum { diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index a561e7d1..a542eebe 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -154,6 +154,15 @@ application_get (u32 index) return pool_elt_at_index (app_pool, index); } +application_t * +application_get_if_valid (u32 index) +{ + if (pool_is_free_index (app_pool, index)) + return 0; + + return pool_elt_at_index (app_pool, index); +} + u32 application_get_index (application_t * app) { @@ -209,7 +218,7 @@ format_application_server (u8 * s, va_list * args) regp = vl_api_client_index_to_registration (srv->api_client_index); if (!regp) - server_name = format (0, "%s%c", regp->name, 0); + server_name = format (0, "builtin-%d%c", srv->index, 0); else server_name = regp->name; @@ -269,11 +278,17 @@ static clib_error_t * show_app_command_fn (vlib_main_t * vm, unformat_input_t * 
input, vlib_cli_command_t * cmd) { + session_manager_main_t *smm = &session_manager_main; application_t *app; int do_server = 0; int do_client = 0; int verbose = 0; + if (!smm->is_enabled) + { + clib_error_return (0, "session layer is not enabled"); + } + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "server")) @@ -323,16 +338,20 @@ show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, /* *INDENT-ON* */ } else - vlib_cli_output (vm, "No active server bindings"); + vlib_cli_output (vm, "No active client bindings"); } return 0; } +/* *INDENT-OFF* */ VLIB_CLI_COMMAND (show_app_command, static) = { -.path = "show app",.short_help = - "show app [server|client] [verbose]",.function = show_app_command_fn,}; + .path = "show app", + .short_help = "show app [server|client] [verbose]", + .function = show_app_command_fn, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index 027d6967..480828f7 100644 --- a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -100,6 +100,7 @@ application_t *application_new (application_type_t type, session_type_t sst, session_cb_vft_t * cb_fns); void application_del (application_t * app); application_t *application_get (u32 index); +application_t *application_get_if_valid (u32 index); application_t *application_lookup (u32 api_client_index); u32 application_get_index (application_t * app); diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 0ea77fd8..6ddfb70f 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -51,7 +51,7 @@ ip_is_local (ip46_address_t * ip46_address, u8 is_ip4) prefix.fp_proto = FIB_PROTOCOL_IP6; } - clib_memcpy (&prefix.fp_addr, ip46_address, sizeof (ip46_address)); + clib_memcpy (&prefix.fp_addr, ip46_address, sizeof (ip46_address_t)); fei = fib_table_lookup 
(0, &prefix); flags = fib_entry_get_flags (fei); @@ -186,9 +186,7 @@ vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, /* * Not connecting to a local server. Create regular session */ - stream_session_open (sst, ip46, port, app->index); - - return 0; + return stream_session_open (sst, ip46, port, app->index); } /** diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index e467f4e9..399077de 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -104,9 +104,13 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, snd_space0 = transport_vft->send_space (tc0); snd_mss0 = transport_vft->send_mss (tc0); + /* Can't make any progress */ if (snd_space0 == 0 || svm_fifo_max_dequeue (s0->server_tx_fifo) == 0 || snd_mss0 == 0) - return 0; + { + vec_add1 (smm->evts_partially_read[thread_index], *e0); + return 0; + } ASSERT (e0->enqueue_length > 0); @@ -143,7 +147,12 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) { /* Keep track of how much we've dequeued and exit */ - e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + if (left_to_snd0 != max_len_to_snd0) + { + e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } + return -1; } @@ -185,12 +194,13 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, t0->server_thread_index = s0->thread_index; } + /* *INDENT-OFF* */ if (1) { - ELOG_TYPE_DECLARE (e) = - { - .format = "evt-dequeue: id %d length %d",.format_args = - "i4i4",}; + ELOG_TYPE_DECLARE (e) = { + .format = "evt-dequeue: id %d length %d", + .format_args = "i4i4", + }; struct { u32 data[2]; @@ -199,6 +209,7 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, ed->data[0] = e0->event_id; ed->data[1] = e0->enqueue_length; } + /* *INDENT-ON* */ len_to_deq0 = (left_to_snd0 < snd_mss0) ? 
left_to_snd0 : snd_mss0; @@ -289,7 +300,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { session_manager_main_t *smm = vnet_get_session_manager_main (); session_fifo_event_t *my_fifo_events, *e; - u32 n_to_dequeue; + u32 n_to_dequeue, n_events; unix_shared_memory_queue_t *q; int n_tx_packets = 0; u32 my_thread_index = vm->cpu_index; @@ -309,14 +320,16 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, /* min number of events we can dequeue without blocking */ n_to_dequeue = q->cursize; - if (n_to_dequeue == 0) - return 0; - my_fifo_events = smm->fifo_events[my_thread_index]; - /* If we didn't manage to process previous events try going + if (n_to_dequeue == 0 && vec_len (my_fifo_events) == 0) + return 0; + + /* + * If we didn't manage to process previous events try going * over them again without dequeuing new ones. - * XXX: Block senders to sessions that can't keep up */ + */ + /* XXX: Block senders to sessions that can't keep up */ if (vec_len (my_fifo_events) >= 100) goto skip_dequeue; @@ -338,8 +351,8 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, smm->fifo_events[my_thread_index] = my_fifo_events; skip_dequeue: - - for (i = 0; i < n_to_dequeue; i++) + n_events = vec_len (my_fifo_events); + for (i = 0; i < n_events; i++) { svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ stream_session_t *s0; @@ -354,8 +367,13 @@ skip_dequeue: /* $$$ add multiple event queues, per vpp worker thread */ ASSERT (server_thread_index0 == my_thread_index); - s0 = pool_elt_at_index (smm->sessions[my_thread_index], - server_session_index0); + s0 = stream_session_get_if_valid (server_session_index0, + my_thread_index); + if (!s0) + { + clib_warning ("It's dead Jim!"); + continue; + } ASSERT (s0->thread_index == my_thread_index); @@ -380,11 +398,11 @@ skip_dequeue: done: /* Couldn't process all events. 
Probably out of buffers */ - if (PREDICT_FALSE (i < n_to_dequeue)) + if (PREDICT_FALSE (i < n_events)) { session_fifo_event_t *partially_read = smm->evts_partially_read[my_thread_index]; - vec_add (partially_read, &my_fifo_events[i], n_to_dequeue - i); + vec_add (partially_read, &my_fifo_events[i], n_events - i); vec_free (my_fifo_events); smm->fifo_events[my_thread_index] = partially_read; smm->evts_partially_read[my_thread_index] = 0; @@ -413,8 +431,7 @@ VLIB_REGISTER_NODE (session_queue_node) = .n_errors = ARRAY_LEN (session_queue_error_strings), .error_strings = session_queue_error_strings, .n_next_nodes = SESSION_QUEUE_N_NEXT, - /* .state = VLIB_NODE_STATE_DISABLED, enable on-demand? */ - /* edit / add dispositions here */ + .state = VLIB_NODE_STATE_DISABLED, .next_nodes = { [SESSION_QUEUE_NEXT_DROP] = "error-drop", diff --git a/src/vnet/session/session.api b/src/vnet/session/session.api index a7b28c1d..582765b5 100644 --- a/src/vnet/session/session.api +++ b/src/vnet/session/session.api @@ -422,6 +422,28 @@ define reset_sock_reply { i32 retval; u64 handle; }; + +/** \brief enable/disable session layer + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param is_enable - disable session layer if 0, enable otherwise +*/ +define session_enable_disable { + u32 client_index; + u32 context; + u8 is_enable; +}; + +/** \brief Reply for session enable/disable + @param context - returned sender context, to match reply w/ request + @param retval - return code +*/ +define session_enable_disable_reply { + u32 context; + i32 retval; +}; + /* * Local Variables: * eval: (c-set-style "gnu") diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 539da613..422527e0 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -311,11 +311,11 @@ stream_session_half_open_lookup (session_manager_main_t * smm, } transport_connection_t * 
-stream_session_lookup_transport4 (session_manager_main_t * smm, - ip4_address_t * lcl, ip4_address_t * rmt, +stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index) { + session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; stream_session_t *s; int rv; @@ -345,11 +345,11 @@ stream_session_lookup_transport4 (session_manager_main_t * smm, } transport_connection_t * -stream_session_lookup_transport6 (session_manager_main_t * smm, - ip6_address_t * lcl, ip6_address_t * rmt, +stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 my_thread_index) { + session_manager_main_t *smm = &session_manager_main; stream_session_t *s; session_kv6_t kv6; int rv; @@ -554,7 +554,7 @@ session_manager_allocate_session_fifos (session_manager_main_t * smm, u8 * added_a_segment) { svm_fifo_segment_private_t *fifo_segment; - u32 fifo_size, default_fifo_size = 8192 /* TODO config */ ; + u32 fifo_size, default_fifo_size = 128 << 10; /* TODO config */ int i; *added_a_segment = 0; @@ -948,7 +948,7 @@ void connects_session_manager_init (session_manager_main_t * smm, u8 session_type) { session_manager_t *sm; - u32 connect_fifo_size = 8 << 10; /* Config? */ + u32 connect_fifo_size = 256 << 10; /* Config? */ u32 default_segment_size = 1 << 20; pool_get (smm->session_managers, sm); @@ -1055,10 +1055,15 @@ stream_session_delete (stream_session_t * s) svm_fifo_segment_free_fifo (fifo_segment, s->server_rx_fifo); svm_fifo_segment_free_fifo (fifo_segment, s->server_tx_fifo); - /* Cleanup app if client */ - app = application_get (s->app_index); + app = application_get_if_valid (s->app_index); + + /* No app. 
A possibility: after disconnect application called unbind */ + if (!app) + return; + if (app->mode == APP_CLIENT) { + /* Cleanup app if client */ application_del (app); } else if (app->mode == APP_SERVER) @@ -1068,6 +1073,7 @@ stream_session_delete (stream_session_t * s) svm_fifo_t **fifos; u32 fifo_index; + /* For server, see if any segments can be removed */ sm = session_manager_get (app->session_manager_index); /* Delete fifo */ @@ -1096,10 +1102,10 @@ stream_session_delete_notify (transport_connection_t * tc) { stream_session_t *s; + /* App might've been removed already */ s = stream_session_get_if_valid (tc->s_index, tc->thread_index); if (!s) { - clib_warning ("Surprised!"); return; } stream_session_delete (s); @@ -1151,16 +1157,24 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, return 0; } -void +int stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, u32 app_index) { transport_connection_t *tc; u32 tci; u64 value; + int rv; /* Ask transport to open connection */ - tci = tp_vfts[sst].open (addr, port_host_byte_order); + rv = tp_vfts[sst].open (addr, port_host_byte_order); + if (rv < 0) + { + clib_warning ("Transport failed to open connection."); + return VNET_API_ERROR_SESSION_CONNECT_FAIL; + } + + tci = rv; /* Get transport connection */ tc = tp_vfts[sst].get_half_open (tci); @@ -1170,6 +1184,8 @@ stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, /* Add to the half-open lookup table */ stream_session_half_open_table_add (sst, tc, value); + + return 0; } /** @@ -1216,16 +1232,13 @@ session_get_transport_vft (u8 type) } static clib_error_t * -session_manager_main_init (vlib_main_t * vm) +session_manager_main_enable (vlib_main_t * vm) { - u32 num_threads; - vlib_thread_main_t *vtm = vlib_get_thread_main (); session_manager_main_t *smm = &session_manager_main; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; int i; - smm->vlib_main = vm; - smm->vnet_main = 
vnet_get_main (); - num_threads = 1 /* main thread */ + vtm->n_threads; if (num_threads < 1) @@ -1272,11 +1285,48 @@ session_manager_main_init (vlib_main_t * vm) for (i = 0; i < SESSION_N_TYPES; i++) smm->connect_manager_index[i] = INVALID_INDEX; + smm->is_enabled = 1; + return 0; } -VLIB_INIT_FUNCTION (session_manager_main_init); +clib_error_t * +vnet_session_enable_disable (vlib_main_t * vm, u8 is_en) +{ + if (is_en) + { + if (session_manager_main.is_enabled) + return 0; + + vlib_node_set_state (vm, session_queue_node.index, + VLIB_NODE_STATE_POLLING); + + return session_manager_main_enable (vm); + } + else + { + session_manager_main.is_enabled = 0; + vlib_node_set_state (vm, session_queue_node.index, + VLIB_NODE_STATE_DISABLED); + } + + return 0; +} + + +clib_error_t * +session_manager_main_init (vlib_main_t * vm) +{ + session_manager_main_t *smm = &session_manager_main; + + smm->vlib_main = vm; + smm->vnet_main = vnet_get_main (); + smm->is_enabled = 0; + + return 0; +} +VLIB_INIT_FUNCTION (session_manager_main_init) /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index cf14cca9..46e5ce2c 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -213,12 +213,15 @@ struct _session_manager_main /** Per transport rx function that can either dequeue or peek */ session_fifo_rx_fn *session_rx_fns[SESSION_N_TYPES]; + u8 is_enabled; + /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; }; extern session_manager_main_t session_manager_main; +extern vlib_node_registration_t session_queue_node; /* * Session manager function @@ -276,14 +279,12 @@ stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, u16 rmt_port, u8, u32 thread_index); transport_connection_t - * stream_session_lookup_transport4 (session_manager_main_t * smm, - ip4_address_t * lcl, + * stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 
lcl_port, u16 rmt_port, u8 proto, u32 thread_index); transport_connection_t - * stream_session_lookup_transport6 (session_manager_main_t * smm, - ip6_address_t * lcl, + * stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index); @@ -338,6 +339,14 @@ stream_session_max_enqueue (transport_connection_t * tc) return svm_fifo_max_enqueue (s->server_rx_fifo); } +always_inline u32 +stream_session_fifo_size (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return s->server_rx_fifo->nitems; +} + + int stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, u8 queue_event); @@ -356,8 +365,8 @@ void stream_session_reset_notify (transport_connection_t * tc); int stream_session_accept (transport_connection_t * tc, u32 listener_index, u8 sst, u8 notify); -void stream_session_open (u8 sst, ip46_address_t * addr, - u16 port_host_byte_order, u32 api_client_index); +int stream_session_open (u8 sst, ip46_address_t * addr, + u16 port_host_byte_order, u32 api_client_index); void stream_session_disconnect (stream_session_t * s); void stream_session_cleanup (stream_session_t * s); int @@ -369,6 +378,8 @@ u8 *format_stream_session (u8 * s, va_list * args); void session_register_transport (u8 type, const transport_proto_vft_t * vft); transport_proto_vft_t *session_get_transport_vft (u8 type); +clib_error_t *vnet_session_enable_disable (vlib_main_t * vm, u8 is_en); + #endif /* __included_session_h__ */ /* diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 9d068684..8852fc6e 100644 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -52,6 +52,8 @@ _(DISCONNECT_SOCK, disconnect_sock) \ _(DISCONNECT_SOCK_REPLY, disconnect_sock_reply) \ _(ACCEPT_SOCK_REPLY, accept_sock_reply) \ _(RESET_SOCK_REPLY, reset_sock_reply) \ +_(SESSION_ENABLE_DISABLE, session_enable_disable) \ + static int 
send_add_segment_callback (u32 api_client_index, const u8 * segment_name, @@ -146,7 +148,6 @@ send_session_connected_uri_callback (u32 api_client_index, mp = vl_msg_api_alloc (sizeof (*mp)); mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); mp->context = app->api_context; - mp->retval = is_fail; if (!is_fail) { vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); @@ -157,6 +158,7 @@ send_session_connected_uri_callback (u32 api_client_index, mp->session_type = s->session_type; mp->vpp_event_queue_address = (u64) vpp_queue; mp->client_event_queue_address = (u64) app->event_queue; + mp->retval = 0; session_manager_get_segment_info (s->server_segment_index, &seg_name, &mp->segment_size); @@ -164,12 +166,22 @@ send_session_connected_uri_callback (u32 api_client_index, if (mp->segment_name_length) clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); } + else + { + mp->retval = VNET_API_ERROR_SESSION_CONNECT_FAIL; + } vl_msg_api_send_shmem (q, (u8 *) & mp); /* Remove client if connect failed */ if (is_fail) - application_del (app); + { + application_del (app); + } + else + { + s->session_state = SESSION_STATE_READY; + } return 0; } @@ -431,6 +443,17 @@ api_session_not_valid (u32 session_index, u32 thread_index) return 0; } +static void +vl_api_session_enable_disable_t_handler (vl_api_session_enable_disable_t * mp) +{ + vl_api_session_enable_disable_reply_t *rmp; + vlib_main_t *vm = vlib_get_main (); + int rv = 0; + + vnet_session_enable_disable (vm, mp->is_enable); + REPLY_MACRO (VL_API_SESSION_ENABLE_DISABLE_REPLY); +} + static void vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) { @@ -476,7 +499,6 @@ vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) } })); /* *INDENT-ON* */ - } static void @@ -493,7 +515,9 @@ vl_api_unbind_uri_t_handler (vl_api_unbind_uri_t * mp) static void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) { + vl_api_connect_uri_reply_t *rmp; vnet_connect_args_t _a, *a = &_a; + int rv; a->uri = 
(char *) mp->uri; a->api_client_index = mp->client_index; @@ -501,7 +525,19 @@ vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) a->options = mp->options; a->session_cb_vft = &uri_session_cb_vft; a->mp = mp; - vnet_connect_uri (a); + + rv = vnet_connect_uri (a); + + if (rv == 0 || rv == VNET_CONNECT_REDIRECTED) + return; + + /* Got some error, relay it */ + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_CONNECT_URI_REPLY, ({ + rmp->retval = rv; + })); + /* *INDENT-ON* */ } static void @@ -662,7 +698,9 @@ vl_api_unbind_sock_t_handler (vl_api_unbind_sock_t * mp) static void vl_api_connect_sock_t_handler (vl_api_connect_sock_t * mp) { + vl_api_connect_sock_reply_t *rmp; vnet_connect_args_t _a, *a = &_a; + int rv; clib_memcpy (&a->tep.ip, mp->ip, (mp->is_ip4 ? sizeof (ip4_address_t) : @@ -675,7 +713,18 @@ vl_api_connect_sock_t_handler (vl_api_connect_sock_t * mp) a->api_context = mp->context; a->mp = mp; - vnet_connect (a); + rv = vnet_connect (a); + + if (rv == 0 || rv == VNET_CONNECT_REDIRECTED) + return; + + /* Got some error, relay it */ + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_CONNECT_URI_REPLY, ({ + rmp->retval = rv; + })); + /* *INDENT-ON* */ } static void diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index b2943a1c..b029ee65 100644 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -60,7 +60,7 @@ format_stream_session (u8 * s, va_list * args) } else { - clib_warning ("Session in unknown state!"); + clib_warning ("Session in state: %d!", ss->session_state); } vec_free (str); @@ -78,6 +78,11 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, stream_session_t *s; u8 *str = 0; + if (!smm->is_enabled) + { + clib_error_return (0, "session layer is not enabled"); + } + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "verbose")) @@ -126,11 +131,14 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, return 0; } -VLIB_CLI_COMMAND 
(show_uri_command, static) = +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_session_command, static) = { -.path = "show session",.short_help = "show session [verbose]",.function = - show_session_command_fn,}; - + .path = "show session", + .short_help = "show session [verbose]", + .function = show_session_command_fn, +}; +/* *INDENT-ON* */ static clib_error_t * clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -142,6 +150,11 @@ clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, stream_session_t *pool, *session; application_t *server; + if (!smm->is_enabled) + { + clib_error_return (0, "session layer is not enabled"); + } + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "thread %d", &thread_index)) @@ -174,11 +187,43 @@ clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, return 0; } -VLIB_CLI_COMMAND (clear_uri_session_command, static) = +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_session_command, static) = +{ + .path = "clear session", + .short_help = "clear session thread session ", + .function = clear_session_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +session_enable_disable_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 is_en = 1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "enable")) + is_en = 1; + else if (unformat (input, "disable")) + is_en = 0; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + return vnet_session_enable_disable (vm, is_en); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (session_enable_disable_command, static) = { -.path = "clear session",.short_help = - "clear session thread session ",.function = - clear_session_command_fn,}; + .path = "session", + .short_help = "session [enable|disable]", + .function = session_enable_disable_fn, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON 
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c new file mode 100644 index 00000000..be65642a --- /dev/null +++ b/src/vnet/tcp/builtin_server.c @@ -0,0 +1,135 @@ +/* +* Copyright (c) 2015-2017 Cisco and/or its affiliates. +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at: +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include +#include + +int +builtin_session_accept_callback (stream_session_t * s) +{ + clib_warning ("called..."); + s->session_state = SESSION_STATE_READY; + return 0; +} + +void +builtin_session_disconnect_callback (stream_session_t * s) +{ + clib_warning ("called..."); +} + +int +builtin_session_connected_callback (u32 client_index, + stream_session_t * s, u8 is_fail) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_add_segment_callback (u32 client_index, + const u8 * seg_name, u32 seg_size) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_redirect_connect_callback (u32 client_index, void *mp) +{ + clib_warning ("called..."); + return -1; +} + +int +builtin_server_rx_callback (stream_session_t * s) +{ + clib_warning ("called..."); + return 0; +} + +static session_cb_vft_t builtin_session_cb_vft = { + .session_accept_callback = builtin_session_accept_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .session_connected_callback = builtin_session_connected_callback, + .add_segment_callback = builtin_add_segment_callback, + .redirect_connect_callback = builtin_redirect_connect_callback, + 
.builtin_server_rx_callback = builtin_server_rx_callback +}; + +static int +server_create (vlib_main_t * vm) +{ + vnet_bind_args_t _a, *a = &_a; + u64 options[SESSION_OPTIONS_N_OPTIONS]; + char segment_name[128]; + + memset (a, 0, sizeof (*a)); + memset (options, 0, sizeof (options)); + + a->uri = "tcp://0.0.0.0/80"; + a->api_client_index = ~0; + a->session_cb_vft = &builtin_session_cb_vft; + a->options = options; + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->segment_name = segment_name; + a->segment_name_length = ARRAY_LEN (segment_name); + + return vnet_bind_uri (a); +} + +static clib_error_t * +server_create_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int rv; +#if 0 + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "whatever %d", &whatever)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } +#endif + + rv = server_create (vm); + switch (rv) + { + case 0: + break; + default: + return clib_error_return (0, "server_create returned %d", rv); + } + return 0; +} + +VLIB_CLI_COMMAND (server_create_command, static) = +{ +.path = "test server",.short_help = "test server",.function = + server_create_command_fn,}; + +/* +* fd.io coding-style-patch-verification: ON +* +* Local Variables: +* eval: (c-set-style "gnu") +* End: +*/ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 0f9b7097..e5feaeb1 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -217,6 +217,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) return 0; } +#define PORT_MASK ((1 << 16)- 1) /** * Allocate local port and add if successful add entry to local endpoint * table to mark the pair as used. 
@@ -224,7 +225,6 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) u16 tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) { - u8 unique = 0; transport_endpoint_t *tep; u32 time_now, tei; u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ @@ -235,37 +235,34 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); - tep->port = random_u32 (&time_now) << 16; - tep->port = tep->port < min ? max : tep->port; /* Search for first free slot */ - while (tries) + for (; tries >= 0; tries--) { + u16 port = 0; + + /* Find a port in the specified range */ + while (1) + { + port = random_u32 (&time_now) & PORT_MASK; + if (PREDICT_TRUE (port >= min && port < max)) + break; + } + + tep->port = port; + + /* Look it up */ tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, tep->port); + /* If not found, we're done */ if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) { - unique = 1; - break; + transport_endpoint_table_add (&tm->local_endpoints_table, tep, + tep - tm->local_endpoints); + return tep->port; } - - tep->port--; - - if (tep->port < min) - tep->port = max; - - tries--; } - - if (unique) - { - transport_endpoint_table_add (&tm->local_endpoints_table, tep, - tep - tm->local_endpoints); - - return tep->port; - } - - /* Failed */ + /* No free ports */ pool_put (tm->local_endpoints, tep); return -1; } @@ -360,7 +357,10 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) /* Allocate source port */ lcl_port = tcp_allocate_local_port (tm, &lcl_addr); if (lcl_port < 1) - return -1; + { + clib_warning ("Failed to allocate src port"); + return -1; + } /* * Create connection and send SYN diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 22f00a63..3560509d 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -30,7 +30,8 @@ #define TCP_MAX_OPTION_SPACE 40 #define TCP_DUPACK_THRESHOLD 
3 -#define TCP_DEFAULT_RX_FIFO_SIZE 64 << 10 +#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_IW_N_SEGMENTS 10 /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -590,7 +591,6 @@ vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, /** * Push TCP header to buffer * - * @param vm - vlib_main * @param b - buffer to write the header to * @param sp_net - source port net order * @param dp_net - destination port net order diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index daa0683b..0a907d0a 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -711,7 +711,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_opts_sack_permitted (&tc->opt)) tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale; + new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) { @@ -1320,7 +1320,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Parse options */ tcp_options_parse (tcp0, &new_tc0->opt); - tcp_connection_init_vars (new_tc0); if (tcp_opts_tstamp (&new_tc0->opt)) { @@ -1331,11 +1330,13 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->opt)) new_tc0->snd_wscale = new_tc0->opt.wscale; - new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) - << new_tc0->snd_wscale; + /* No scaling */ + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; + tcp_connection_init_vars (new_tc0); + /* SYN-ACK: See if we can switch to ESTABLISHED state */ if (tcp_ack (tcp0)) { @@ -1345,6 +1346,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->snd_una = ack0; new_tc0->state = TCP_STATE_ESTABLISHED; + /* Make sure las is initialized for the wnd computation */ + new_tc0->rcv_las = new_tc0->rcv_nxt; + /* Notify app that we 
have connection */ stream_session_connect_notify (&new_tc0->connection, sst, 0); @@ -1575,7 +1579,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Initialize session variables */ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; - tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) << tc0->opt.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -1899,7 +1903,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tcp_options_parse (th0, &child0->opt); - tcp_connection_init_vars (child0); child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; @@ -1913,6 +1916,16 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->tsval_recent_age = tcp_time_now (); } + if (tcp_opts_wscale (&child0->opt)) + child0->snd_wscale = child0->opt.wscale; + + /* No scaling */ + child0->snd_wnd = clib_net_to_host_u16 (th0->window); + child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + tcp_connection_init_vars (child0); + /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); @@ -1923,7 +1936,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } - b0->error = error0 ? 
node->errors[error0] : 0; + b0->error = node->errors[error0]; vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -2069,7 +2082,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->cpu_index; tcp_main_t *tm = vnet_get_tcp_main (); - session_manager_main_t *ssm = vnet_get_session_manager_main (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -2109,26 +2121,26 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* lookup session */ tc0 = - (tcp_connection_t *) stream_session_lookup_transport4 (ssm, - &ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); + (tcp_connection_t *) + stream_session_lookup_transport4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); } else { ip60 = vlib_buffer_get_current (b0); tcp0 = ip6_next_header (ip60); tc0 = - (tcp_connection_t *) stream_session_lookup_transport6 (ssm, - &ip60->src_address, - &ip60->dst_address, - tcp0->src_port, - tcp0->dst_port, - SESSION_TYPE_IP6_TCP, - my_thread_index); + (tcp_connection_t *) + stream_session_lookup_transport6 (&ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); } /* Session exists */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index dbcf1f74..7e431cd0 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -90,6 +90,15 @@ tcp_window_compute_scale (u32 available_space) return wnd_scale; } +/** + * TCP's IW as recommended by RFC6928 + */ +always_inline u32 +tcp_initial_wnd_unscaled (tcp_connection_t * tc) +{ + return TCP_IW_N_SEGMENTS * dummy_mtu; +} + /** * Compute initial window and scale factor. 
As per RFC1323, window field in * SYN and SYN-ACK segments is never scaled. @@ -97,18 +106,15 @@ tcp_window_compute_scale (u32 available_space) u32 tcp_initial_window_to_advertise (tcp_connection_t * tc) { - u32 available_space; + u32 max_fifo; /* Initial wnd for SYN. Fifos are not allocated yet. - * Use some predefined value */ - if (tc->state != TCP_STATE_SYN_RCVD) - { - return TCP_DEFAULT_RX_FIFO_SIZE; - } + * Use some predefined value. For SYN-ACK we still want the + * scale to be computed in the same way */ + max_fifo = TCP_MAX_RX_FIFO_SIZE; - available_space = stream_session_max_enqueue (&tc->connection); - tc->rcv_wscale = tcp_window_compute_scale (available_space); - tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + tc->rcv_wscale = tcp_window_compute_scale (max_fifo); + tc->rcv_wnd = tcp_initial_wnd_unscaled (tc); return clib_min (tc->rcv_wnd, TCP_WND_MAX); } @@ -119,23 +125,43 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) u32 tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) { - u32 available_space, wnd, scaled_space; + u32 available_space, max_fifo, observed_wnd; - if (state != TCP_STATE_ESTABLISHED) + if (state < TCP_STATE_ESTABLISHED) return tcp_initial_window_to_advertise (tc); + /* + * Figure out how much space we have available + */ available_space = stream_session_max_enqueue (&tc->connection); - scaled_space = available_space >> tc->rcv_wscale; + max_fifo = stream_session_fifo_size (&tc->connection); + + ASSERT (tc->opt.mss < max_fifo); + + if (available_space < tc->opt.mss && available_space < max_fifo / 8) + available_space = 0; - /* Need to update scale */ - if (PREDICT_FALSE ((scaled_space == 0 && available_space != 0)) - || (scaled_space >= TCP_WND_MAX)) - tc->rcv_wscale = tcp_window_compute_scale (available_space); + /* + * Use the above and what we know about what we've previously advertised + * to compute the new window + */ + observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); - 
wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); - tc->rcv_wnd = wnd; + /* Bad. Thou shalt not shrink */ + if (available_space < observed_wnd) + { + if (available_space == 0) + clib_warning ("Didn't shrink rcv window despite not having space"); + } + + tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } - return wnd >> tc->rcv_wscale; + return tc->rcv_wnd >> tc->rcv_wscale; } /** @@ -225,7 +251,7 @@ tcp_options_write (u8 * data, tcp_options_t * opts) } always_inline int -tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) +tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) { u8 len = 0; @@ -234,7 +260,7 @@ tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) len += TCP_OPTION_LEN_MSS; opts->flags |= TCP_OPTS_FLAG_WSCALE; - opts->wscale = tcp_window_compute_scale (initial_wnd); + opts->wscale = wnd_scale; len += TCP_OPTION_LEN_WINDOW_SCALE; opts->flags |= TCP_OPTS_FLAG_TSTAMP; @@ -327,8 +353,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, case TCP_STATE_SYN_RCVD: return tcp_make_synack_options (tc, opts); case TCP_STATE_SYN_SENT: - return tcp_make_syn_options (opts, - tcp_initial_window_to_advertise (tc)); + return tcp_make_syn_options (opts, tc->rcv_wscale); default: clib_warning ("Not handled!"); return 0; @@ -732,7 +757,7 @@ tcp_send_syn (tcp_connection_t * tc) /* Make and write options */ memset (&snd_opts, 0, sizeof (snd_opts)); - tcp_opts_len = tcp_make_syn_options (&snd_opts, initial_wnd); + tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, @@ -900,7 +925,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, tcp_reuse_buffer (vm, b); - ASSERT (tc->state == TCP_STATE_ESTABLISHED); + ASSERT (tc->state >= TCP_STATE_ESTABLISHED); ASSERT (max_bytes 
!= 0); if (tcp_opts_sack_permitted (&tc->opt)) @@ -929,7 +954,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, max_bytes); ASSERT (n_bytes != 0); - tc->snd_nxt += n_bytes; tcp_push_hdr_i (tc, b, tc->state); return n_bytes; @@ -967,7 +991,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (tc->state == TCP_STATE_ESTABLISHED) + if (tc->state >= TCP_STATE_ESTABLISHED) { tcp_fastrecovery_off (tc); @@ -977,6 +1001,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Figure out what and how many bytes we can send */ snd_space = tcp_available_snd_space (tc); max_bytes = clib_min (tc->snd_mss, snd_space); + + if (max_bytes == 0) + { + clib_warning ("no wnd to retransmit"); + return; + } tcp_prepare_retransmit_segment (tc, b, max_bytes); tc->rtx_bytes += max_bytes; @@ -996,7 +1026,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_push_hdr_i (tc, b, tc->state); + + /* Account for the SYN */ + tc->snd_nxt += 1; } if (!is_syn) @@ -1163,8 +1197,8 @@ tcp46_output_inline (vlib_main_t * vm, if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) { + ASSERT (tc0->snt_dupacks > 0); tc0->snt_dupacks--; - ASSERT (tc0->snt_dupacks >= 0); if (!tcp_session_has_ooo_data (tc0)) { error0 = TCP_ERROR_FILTERED_DUPACKS; -- cgit 1.2.3-korg From d79b41e993981df80245b0e6d90eb691bdaae648 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Sat, 4 Mar 2017 05:37:52 -0800 Subject: VPP-659 TCP improvements - builtin test echo server - fix SYN-ACK retransmit canceling - avoid sending spurious ACK if in LAST_ACK - improved client dummy test app - renamed tx fifo dequeuing and sending functions to avoid confusion - improved RST handling Change-Id: Ia14aad3df319540dcf6e6a4e18a9f8d423a4b83b Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- 
src/scripts/vnet/uri/afp_setup.cli | 5 ++ src/scripts/vnet/uri/dummy_app.py | 26 ++++++++- src/scripts/vnet/uri/tap_setup.cli | 5 ++ src/scripts/vnet/uri/tcp_server | 5 -- src/uri/uri_tcp_test.c | 7 ++- src/vnet/session/application.c | 16 ++++++ src/vnet/session/application.h | 3 +- src/vnet/session/application_interface.c | 5 +- src/vnet/session/application_interface.h | 4 +- src/vnet/session/node.c | 56 +++++++++++-------- src/vnet/session/session.c | 38 +++++++++---- src/vnet/session/session.h | 28 +++++----- src/vnet/session/session_api.c | 50 +++++++++++++++-- src/vnet/session/transport.h | 66 +++++++++++----------- src/vnet/tcp/builtin_server.c | 94 ++++++++++++++++++++++++++++++-- src/vnet/tcp/tcp.c | 63 +++++++++++++++++---- src/vnet/tcp/tcp.h | 14 +++-- src/vnet/tcp/tcp_error.def | 11 ++-- src/vnet/tcp/tcp_input.c | 63 ++++++++++++++------- src/vnet/tcp/tcp_output.c | 47 +++++++++++++--- src/vnet/udp/builtin_server.c | 2 +- src/vnet/udp/udp_input.c | 14 ++--- src/vnet/unix/tapcli.c | 3 +- 23 files changed, 459 insertions(+), 166 deletions(-) create mode 100644 src/scripts/vnet/uri/afp_setup.cli create mode 100644 src/scripts/vnet/uri/tap_setup.cli delete mode 100644 src/scripts/vnet/uri/tcp_server (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/scripts/vnet/uri/afp_setup.cli b/src/scripts/vnet/uri/afp_setup.cli new file mode 100644 index 00000000..c29afc6f --- /dev/null +++ b/src/scripts/vnet/uri/afp_setup.cli @@ -0,0 +1,5 @@ +create host-interface name vpp1 +set int state host-vpp1 up +set int ip address host-vpp1 6.0.1.1/24 +trace add af-packet-input 10 +session enable diff --git a/src/scripts/vnet/uri/dummy_app.py b/src/scripts/vnet/uri/dummy_app.py index b80fbb28..50333923 100644 --- a/src/scripts/vnet/uri/dummy_app.py +++ b/src/scripts/vnet/uri/dummy_app.py @@ -2,7 +2,7 @@ import socket import sys -import bitstring +import time # action can be reflect or drop action = "drop" @@ -22,6 +22,7 @@ def handle_connection (connection, 
client_address): def run_server(ip, port): print("Starting server {}:{}".format(repr(ip), repr(port))) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) server_address = (ip, int(port)) sock.bind(server_address) sock.listen(1) @@ -39,12 +40,31 @@ def prepare_data(): def run_client(ip, port): print("Starting client {}:{}".format(repr(ip), repr(port))) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_address = ("6.0.1.1", 1234) + server_address = (ip, port) sock.connect(server_address) data = prepare_data() + n_rcvd = 0 + n_sent = len (data) try: sock.sendall(data) + + timeout = time.time() + 2 + while n_rcvd < n_sent and time.time() < timeout: + tmp = sock.recv(1500) + tmp = bytearray (tmp) + n_read = len(tmp) + for i in range(n_read): + if (data[n_rcvd + i] != tmp[i]): + print("Difference at byte {}. Sent {} got {}" + .format(n_rcvd + i, data[n_rcvd + i], tmp[i])) + n_rcvd += n_read + + if (n_rcvd < n_sent or n_rcvd > n_sent): + print("Sent {} and got back {}".format(n_sent, n_rcvd)) + else: + print("Got back what we've sent!!"); + finally: sock.close() @@ -62,4 +82,4 @@ if __name__ == "__main__": if (len(sys.argv) == 5): action = sys.argv[4] - run (sys.argv[1], sys.argv[2], sys.argv[3]) + run (sys.argv[1], sys.argv[2], int(sys.argv[3])) diff --git a/src/scripts/vnet/uri/tap_setup.cli b/src/scripts/vnet/uri/tap_setup.cli new file mode 100644 index 00000000..1d9a1b36 --- /dev/null +++ b/src/scripts/vnet/uri/tap_setup.cli @@ -0,0 +1,5 @@ +tap connect tap0 address 6.0.1.2/24 +set int ip addr tap-0 6.0.1.1/24 +set int state tap-0 up +trace add tapcli-rx 10 +session enable diff --git a/src/scripts/vnet/uri/tcp_server b/src/scripts/vnet/uri/tcp_server deleted file mode 100644 index c29afc6f..00000000 --- a/src/scripts/vnet/uri/tcp_server +++ /dev/null @@ -1,5 +0,0 @@ -create host-interface name vpp1 -set int state host-vpp1 up -set int ip address host-vpp1 6.0.1.1/24 -trace add 
af-packet-input 10 -session enable diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 6c9cf1db..261fd288 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -287,6 +287,7 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) session = pool_elt_at_index (utm->sessions, p[0]); hash_unset (utm->session_index_by_vpp_handles, key); pool_put (utm->sessions, session); + utm->time_to_stop = 1; } else { @@ -296,7 +297,7 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); - rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->_vl_msg_id = ntohs (VL_API_RESET_SESSION_REPLY); rmp->retval = rv; rmp->session_index = mp->session_index; rmp->session_thread_index = mp->session_thread_index; @@ -734,7 +735,7 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); } - while (rv == -2); + while (rv == -2 && !utm->time_to_stop); /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; @@ -750,7 +751,7 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, if (n_read > 0) bytes -= n_read; } - while (n_read < 0 || bytes > 0); + while ((n_read < 0 || bytes > 0) && !utm->time_to_stop); } void diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index a542eebe..513e5fac 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -92,6 +92,19 @@ application_del (application_t * app) pool_put (app_pool, app); } +static void +application_verify_cb_fns (application_type_t type, session_cb_vft_t * cb_fns) +{ + if (type == APP_SERVER && cb_fns->session_accept_callback == 0) + clib_warning ("No accept callback function provided"); + if (type == APP_CLIENT && cb_fns->session_connected_callback == 0) + clib_warning ("No session connected callback function provided"); + if (cb_fns->session_disconnect_callback == 0) + clib_warning ("No session 
disconnect callback function provided"); + if (cb_fns->session_reset_callback == 0) + clib_warning ("No session reset callback function provided"); +} + application_t * application_new (application_type_t type, session_type_t sst, u32 api_client_index, u32 flags, session_cb_vft_t * cb_fns) @@ -142,6 +155,9 @@ application_new (application_type_t type, session_type_t sst, app->flags = flags; app->cb_fns = *cb_fns; + /* Check that the obvious things are properly set up */ + application_verify_cb_fns (type, cb_fns); + /* Add app to lookup by api_client_index table */ application_table_add (app); diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index 480828f7..a60a8b8b 100644 --- a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -45,7 +45,8 @@ typedef struct _stream_session_cb_vft void (*session_reset_callback) (stream_session_t * s); /* Direct RX callback, for built-in servers */ - int (*builtin_server_rx_callback) (stream_session_t * session); + int (*builtin_server_rx_callback) (stream_session_t * session, + session_fifo_event_t * ep); /* Redirect connection to local server */ int (*redirect_connect_callback) (u32 api_client_index, void *mp); diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 6ddfb70f..4b30bd87 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -98,7 +98,7 @@ vnet_bind_i (u32 api_client_index, ip46_address_t * ip46, u16 port_host_order, if (application_lookup (api_client_index)) { - clib_warning ("Only one bind supported for now"); + clib_warning ("Only one connection supported for now"); return VNET_API_ERROR_ADDRESS_IN_USE; } @@ -364,8 +364,7 @@ vnet_connect_uri (vnet_connect_args_t * a) } int -vnet_disconnect_session (u32 client_index, u32 session_index, - u32 thread_index) +vnet_disconnect_session (u32 session_index, u32 thread_index) { stream_session_t *session; diff --git 
a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index 8d87c067..a5f2b9a6 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -112,9 +112,7 @@ typedef enum int vnet_bind_uri (vnet_bind_args_t *); int vnet_unbind_uri (char *uri, u32 api_client_index); int vnet_connect_uri (vnet_connect_args_t * a); -int -vnet_disconnect_session (u32 client_index, u32 session_index, - u32 thread_index); +int vnet_disconnect_session (u32 session_index, u32 thread_index); int vnet_bind (vnet_bind_args_t * a); int vnet_connect (vnet_connect_args_t * a); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 399077de..7fd7e0b7 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -78,10 +78,11 @@ static u32 session_type_to_next[] = { }; always_inline int -session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, session_fifo_event_t * e0, - stream_session_t * s0, u32 thread_index, int *n_tx_packets, - u8 peek_data) +session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_packets, u8 peek_data) { u32 n_trace = vlib_get_trace_count (vm, node); u32 left_to_snd0, max_len_to_snd0, len_to_deq0, n_bufs, snd_space0; @@ -120,7 +121,7 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (peek_data) { /* Offset in rx fifo from where to peek data */ - rx_offset = transport_vft->rx_fifo_offset (tc0); + rx_offset = transport_vft->tx_fifo_offset (tc0); } /* TODO check if transport is willing to send len_to_snd0 @@ -194,25 +195,27 @@ session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, t0->server_thread_index = s0->thread_index; } + len_to_deq0 = (left_to_snd0 < snd_mss0) ? 
left_to_snd0 : snd_mss0; + /* *INDENT-OFF* */ if (1) { ELOG_TYPE_DECLARE (e) = { - .format = "evt-dequeue: id %d length %d", - .format_args = "i4i4", + .format = "evt-deq: id %d len %d rd %d wnd %d", + .format_args = "i4i4i4i4", }; struct { - u32 data[2]; + u32 data[4]; } *ed; ed = ELOG_DATA (&vm->elog_main, e); ed->data[0] = e0->event_id; ed->data[1] = e0->enqueue_length; + ed->data[2] = len_to_deq0; + ed->data[3] = left_to_snd0; } /* *INDENT-ON* */ - len_to_deq0 = (left_to_snd0 < snd_mss0) ? left_to_snd0 : snd_mss0; - /* Make room for headers */ data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); @@ -276,22 +279,25 @@ dequeue_fail: } int -session_fifo_rx_peek (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, session_fifo_event_t * e0, - stream_session_t * s0, u32 thread_index, int *n_tx_pkts) +session_tx_fifo_peek_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_pkts) { - return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, - 1); + return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, + n_tx_pkts, 1); } int -session_fifo_rx_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, - session_fifo_event_t * e0, stream_session_t * s0, - u32 thread_index, int *n_tx_pkts) +session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_pkts) { - return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, - 0); + return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, + n_tx_pkts, 0); } static uword @@ -369,12 +375,16 @@ skip_dequeue: s0 = stream_session_get_if_valid (server_session_index0, my_thread_index); - if (!s0) + + if (CLIB_DEBUG && !s0) { - clib_warning ("It's dead Jim!"); + 
clib_warning ("It's dead, Jim!"); continue; } + if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + continue; + ASSERT (s0->thread_index == my_thread_index); switch (e0->event_type) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index b5a168ca..8867e794 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -373,7 +373,7 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, /* Finally, try half-open connections */ rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); if (rv == 0) - return tp_vfts[s->session_type].get_half_open (kv6.value & 0xFFFFFFFF); + return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); return 0; } @@ -617,7 +617,10 @@ again: goto again; } else - return SESSION_ERROR_NO_SPACE; + { + clib_warning ("No space to allocate fifos!"); + return SESSION_ERROR_NO_SPACE; + } } return 0; } @@ -806,6 +809,10 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) evt.event_id = serial_number++; evt.enqueue_length = svm_fifo_max_dequeue (s->server_rx_fifo); + /* Built-in server? Hand event to the callback... */ + if (app->cb_fns.builtin_server_rx_callback) + return app->cb_fns.builtin_server_rx_callback (s, &evt); + /* Add event to server's event queue */ q = app->event_queue; @@ -1043,13 +1050,9 @@ stream_session_delete (stream_session_t * s) session_manager_main_t *smm = vnet_get_session_manager_main (); svm_fifo_segment_private_t *fifo_segment; application_t *app; - int rv; - /* delete from the main lookup table */ - rv = stream_session_table_del (smm, s); - - if (rv) - clib_warning ("hash delete error, rv %d", rv); + /* Delete from the main lookup table. 
*/ + stream_session_table_del (smm, s); /* Cleanup fifo segments */ fifo_segment = svm_fifo_get_segment (s->server_segment_index); @@ -1197,18 +1200,30 @@ stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, void stream_session_disconnect (stream_session_t * s) { - tp_vfts[s->session_type].close (s->connection_index, s->thread_index); s->session_state = SESSION_STATE_CLOSED; + tp_vfts[s->session_type].close (s->connection_index, s->thread_index); } /** * Cleanup transport and session state. + * + * Notify transport of the cleanup, wait for a delete notify to actually + * remove the session state. */ void stream_session_cleanup (stream_session_t * s) { + session_manager_main_t *smm = &session_manager_main; + int rv; + + s->session_state = SESSION_STATE_CLOSED; + + /* Delete from the main lookup table to avoid more enqueues */ + rv = stream_session_table_del (smm, s); + if (rv) + clib_warning ("hash delete error, rv %d", rv); + tp_vfts[s->session_type].cleanup (s->connection_index, s->thread_index); - stream_session_delete (s); } void @@ -1221,7 +1236,8 @@ session_register_transport (u8 type, const transport_proto_vft_t * vft) /* If an offset function is provided, then peek instead of dequeue */ smm->session_rx_fns[type] = - (vft->rx_fifo_offset) ? session_fifo_rx_peek : session_fifo_rx_dequeue; + (vft->tx_fifo_offset) ? session_tx_fifo_peek_and_snd : + session_tx_fifo_dequeue_and_snd; } transport_proto_vft_t * diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 46e5ce2c..1b712e2e 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -102,33 +102,33 @@ typedef CLIB_PACKED (struct typedef struct _stream_session_t { + /** fifo pointers. 
Once allocated, these do not move */ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + /** Type */ u8 session_type; /** State */ u8 session_state; + u8 thread_index; + + /** used during unbind processing */ + u8 is_deleted; + + /** To avoid n**2 "one event per frame" check */ + u8 enqueue_epoch; + /** Session index in per_thread pool */ u32 session_index; /** Transport specific */ u32 connection_index; - u8 thread_index; - /** Application specific */ u32 pid; - /** fifo pointers. Once allocated, these do not move */ - svm_fifo_t *server_rx_fifo; - svm_fifo_t *server_tx_fifo; - - /** To avoid n**2 "one event per frame" check */ - u8 enqueue_epoch; - - /** used during unbind processing */ - u8 is_deleted; - /** stream server pool index */ u32 app_index; @@ -162,8 +162,8 @@ typedef int session_fifo_event_t * e0, stream_session_t * s0, u32 thread_index, int *n_tx_pkts); -extern session_fifo_rx_fn session_fifo_rx_peek; -extern session_fifo_rx_fn session_fifo_rx_dequeue; +extern session_fifo_rx_fn session_tx_fifo_peek_and_snd; +extern session_fifo_rx_fn session_tx_fifo_dequeue_and_snd; struct _session_manager_main { diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 8852fc6e..9c38428a 100644 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -130,6 +130,27 @@ send_session_disconnect_uri_callback (stream_session_t * s) vl_msg_api_send_shmem (q, (u8 *) & mp); } +static void +send_session_reset_uri_callback (stream_session_t * s) +{ + vl_api_reset_session_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_RESET_SESSION); + + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + 
static int send_session_connected_uri_callback (u32 api_client_index, stream_session_t * s, u8 is_fail) @@ -347,6 +368,26 @@ send_session_disconnect_callback (stream_session_t * s) vl_msg_api_send_shmem (q, (u8 *) & mp); } +static void +send_session_reset_callback (stream_session_t * s) +{ + vl_api_reset_sock_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_RESET_SOCK); + + mp->handle = make_session_handle (s); + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + /** * Redirect a connect_uri message to the indicated server. * Only sent if the server has bound the related port with @@ -414,6 +455,7 @@ static session_cb_vft_t uri_session_cb_vft = { .session_accept_callback = send_session_accept_uri_callback, .session_disconnect_callback = send_session_disconnect_uri_callback, .session_connected_callback = send_session_connected_uri_callback, + .session_reset_callback = send_session_reset_uri_callback, .add_segment_callback = send_add_segment_callback, .redirect_connect_callback = redirect_connect_uri_callback }; @@ -422,6 +464,7 @@ static session_cb_vft_t session_cb_vft = { .session_accept_callback = send_session_accept_callback, .session_disconnect_callback = send_session_disconnect_callback, .session_connected_callback = send_session_connected_callback, + .session_reset_callback = send_session_reset_callback, .add_segment_callback = send_add_segment_callback, .redirect_connect_callback = redirect_connect_callback }; @@ -548,8 +591,8 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rv = api_session_not_valid (mp->session_index, mp->session_thread_index); if (!rv) - rv = vnet_disconnect_session (mp->client_index, mp->session_index, - mp->session_thread_index); + rv = + vnet_disconnect_session 
(mp->session_index, mp->session_thread_index); REPLY_MACRO (VL_API_DISCONNECT_SESSION_REPLY); } @@ -572,8 +615,7 @@ vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * } /* Disconnect has been confirmed. Confirm close to transport */ - vnet_disconnect_session (mp->client_index, mp->session_index, - mp->session_thread_index); + vnet_disconnect_session (mp->session_index, mp->session_thread_index); } static void diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index f486dbb2..0da30261 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -74,7 +74,7 @@ typedef struct _transport_proto_vft u32 (*push_header) (transport_connection_t * tconn, vlib_buffer_t * b); u16 (*send_mss) (transport_connection_t * tc); u32 (*send_space) (transport_connection_t * tc); - u32 (*rx_fifo_offset) (transport_connection_t * tc); + u32 (*tx_fifo_offset) (transport_connection_t * tc); /* * Connection retrieval @@ -92,39 +92,39 @@ typedef struct _transport_proto_vft } transport_proto_vft_t; +/* *INDENT-OFF* */ /* 16 octets */ -typedef CLIB_PACKED (struct - { - union - { - struct - { - ip4_address_t src; ip4_address_t dst; - u16 src_port; - u16 dst_port; - /* align by making this 4 octets even though its a 1-bit field - * NOTE: avoid key overlap with other transports that use 5 tuples for - * session identification. - */ - u32 proto; - }; - u64 as_u64[2]; - }; - }) v4_connection_key_t; - -typedef CLIB_PACKED (struct - { - union - { - struct - { - /* 48 octets */ - ip6_address_t src; ip6_address_t dst; - u16 src_port; - u16 dst_port; u32 proto; u8 unused_for_now[8]; - }; u64 as_u64[6]; - }; - }) v6_connection_key_t; +typedef CLIB_PACKED (struct { + union + { + struct + { + ip4_address_t src; ip4_address_t dst; + u16 src_port; + u16 dst_port; + /* align by making this 4 octets even though its a 1-bit field + * NOTE: avoid key overlap with other transports that use 5 tuples for + * session identification. 
+ */ + u32 proto; + }; + u64 as_u64[2]; + }; +}) v4_connection_key_t; + +typedef CLIB_PACKED (struct { + union + { + struct + { + /* 48 octets */ + ip6_address_t src; ip6_address_t dst; + u16 src_port; + u16 dst_port; u32 proto; u8 unused_for_now[8]; + }; u64 as_u64[6]; + }; +}) v6_connection_key_t; +/* *INDENT-ON* */ typedef clib_bihash_kv_16_8_t session_kv4_t; typedef clib_bihash_kv_48_8_t session_kv6_t; diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index be65642a..9b697a01 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -18,10 +18,24 @@ #include #include +typedef struct +{ + u8 *rx_buf; + unix_shared_memory_queue_t **vpp_queue; + vlib_main_t *vlib_main; +} builtin_server_main_t; + +builtin_server_main_t builtin_server_main; + + int builtin_session_accept_callback (stream_session_t * s) { + builtin_server_main_t *bsm = &builtin_server_main; clib_warning ("called..."); + + bsm->vpp_queue[s->thread_index] = + session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; return 0; } @@ -30,8 +44,19 @@ void builtin_session_disconnect_callback (stream_session_t * s) { clib_warning ("called..."); + + vnet_disconnect_session (s->session_index, s->thread_index); } +void +builtin_session_reset_callback (stream_session_t * s) +{ + clib_warning ("called.. 
"); + + stream_session_cleanup (s); +} + + int builtin_session_connected_callback (u32 client_index, stream_session_t * s, u8 is_fail) @@ -56,9 +81,57 @@ builtin_redirect_connect_callback (u32 client_index, void *mp) } int -builtin_server_rx_callback (stream_session_t * s) +builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * e) { - clib_warning ("called..."); + int n_written, bytes, total_copy_bytes; + int n_read; + svm_fifo_t *tx_fifo; + builtin_server_main_t *bsm = &builtin_server_main; + session_fifo_event_t evt; + static int serial_number = 0; + + bytes = e->enqueue_length; + if (PREDICT_FALSE (bytes <= 0)) + { + clib_warning ("bizarre rx callback: bytes %d", bytes); + return 0; + } + + tx_fifo = s->server_tx_fifo; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (bytes < (tx_fifo->nitems - tx_fifo->cursize)) ? bytes : + tx_fifo->nitems - tx_fifo->cursize; + + if (PREDICT_FALSE (total_copy_bytes <= 0)) + { + clib_warning ("no space in tx fifo, event had %d bytes", bytes); + return 0; + } + + vec_validate (bsm->rx_buf, total_copy_bytes - 1); + _vec_len (bsm->rx_buf) = total_copy_bytes; + + n_read = svm_fifo_dequeue_nowait (s->server_rx_fifo, 0, total_copy_bytes, + bsm->rx_buf); + ASSERT (n_read == total_copy_bytes); + + /* + * Echo back + */ + + n_written = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, bsm->rx_buf); + ASSERT (n_written == total_copy_bytes); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.enqueue_length = total_copy_bytes; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], (u8 *) & evt, + 0 /* do wait for mutex */ ); + return 0; } @@ -68,7 +141,8 @@ static session_cb_vft_t builtin_session_cb_vft = { .session_connected_callback = builtin_session_connected_callback, .add_segment_callback = builtin_add_segment_callback, .redirect_connect_callback = builtin_redirect_connect_callback, - 
.builtin_server_rx_callback = builtin_server_rx_callback + .builtin_server_rx_callback = builtin_server_rx_callback, + .session_reset_callback = builtin_session_reset_callback }; static int @@ -77,6 +151,11 @@ server_create (vlib_main_t * vm) vnet_bind_args_t _a, *a = &_a; u64 options[SESSION_OPTIONS_N_OPTIONS]; char segment_name[128]; + u32 num_threads; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (builtin_server_main.vpp_queue, num_threads - 1); memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); @@ -110,6 +189,7 @@ server_create_command_fn (vlib_main_t * vm, } #endif + vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); rv = server_create (vm); switch (rv) { @@ -121,10 +201,14 @@ server_create_command_fn (vlib_main_t * vm, return 0; } +/* *INDENT-OFF* */ VLIB_CLI_COMMAND (server_create_command, static) = { -.path = "test server",.short_help = "test server",.function = - server_create_command_fn,}; + .path = "test server", + .short_help = "test server", + .function = server_create_command_fn, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 69433e26..d2df5c3e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -139,6 +139,20 @@ tcp_connection_del (tcp_connection_t * tc) tcp_connection_cleanup (tc); } +/** Notify session that connection has been reset. + * + * Switch state to closed and wait for session to call cleanup. + */ +void +tcp_connection_reset (tcp_connection_t * tc) +{ + if (tc->state == TCP_STATE_CLOSED) + return; + + tc->state = TCP_STATE_CLOSED; + stream_session_reset_notify (&tc->connection); +} + /** * Begin connection closing procedure. * @@ -149,6 +163,8 @@ tcp_connection_del (tcp_connection_t * tc) * calls cleanup. * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers * and cleanup is called. + * + * N.B. 
Half-close connections are not supported */ void tcp_connection_close (tcp_connection_t * tc) @@ -166,9 +182,9 @@ tcp_connection_close (tcp_connection_t * tc) else if (tc->state == TCP_STATE_CLOSE_WAIT) tc->state = TCP_STATE_LAST_ACK; - /* Half-close connections are not supported XXX */ - - if (tc->state == TCP_STATE_CLOSED) + /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ + if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID + && tc->state == TCP_STATE_CLOSED) tcp_connection_del (tc); } @@ -185,7 +201,10 @@ tcp_session_cleanup (u32 conn_index, u32 thread_index) { tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); - tcp_connection_cleanup (tc); + + /* Wait for the session tx events to clear */ + tc->state = TCP_STATE_CLOSED; + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } void * @@ -227,7 +246,8 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) { transport_endpoint_t *tep; u32 time_now, tei; - u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ + u16 min = 1024, max = 65535; /* XXX configurable ? 
*/ + int tries; tries = max - min; time_now = tcp_time_now (); @@ -505,10 +525,10 @@ tcp_session_send_space (transport_connection_t * trans_conn) } u32 -tcp_session_rx_fifo_offset (transport_connection_t * trans_conn) +tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return (tc->snd_una_max - tc->snd_una); + return (tc->snd_nxt - tc->snd_una); } /* *INDENT-OFF* */ @@ -524,7 +544,7 @@ const static transport_proto_vft_t tcp4_proto = { .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, - .rx_fifo_offset = tcp_session_rx_fifo_offset, + .tx_fifo_offset = tcp_session_tx_fifo_offset, .format_connection = format_tcp_session_ip4, .format_listener = format_tcp_listener_session_ip4, .format_half_open = format_tcp_half_open_session_ip4 @@ -542,7 +562,7 @@ const static transport_proto_vft_t tcp6_proto = { .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, - .rx_fifo_offset = tcp_session_rx_fifo_offset, + .tx_fifo_offset = tcp_session_tx_fifo_offset, .format_connection = format_tcp_session_ip6, .format_listener = format_tcp_listener_session_ip6, .format_half_open = format_tcp_half_open_session_ip6 @@ -579,13 +599,32 @@ tcp_timer_establish_handler (u32 conn_index) } void -tcp_timer_2msl_handler (u32 conn_index) +tcp_timer_waitclose_handler (u32 conn_index) { u32 cpu_index = os_get_cpu_number (); tcp_connection_t *tc; tc = tcp_connection_get (conn_index, cpu_index); - tc->timers[TCP_TIMER_2MSL] = TCP_TIMER_HANDLE_INVALID; + tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; + + /* Session didn't come back with a close(). Send FIN either way + * and switch to LAST_ACK. */ + if (tc->state == TCP_STATE_CLOSE_WAIT) + { + if (tc->flags & TCP_CONN_FINSNT) + { + clib_warning ("FIN was sent and still in CLOSE WAIT. 
Weird!"); + } + + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + + /* Make sure we don't wait in LAST ACK forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + + /* Don't delete the connection yet */ + return; + } tcp_connection_del (tc); } @@ -597,7 +636,7 @@ static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = tcp_timer_delack_handler, 0, tcp_timer_keep_handler, - tcp_timer_2msl_handler, + tcp_timer_waitclose_handler, tcp_timer_retransmit_syn_handler, tcp_timer_establish_handler }; diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 7d443433..3b3d8fc7 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -63,8 +63,8 @@ format_function_t format_tcp_state; _(DELACK, "DELAYED ACK") \ _(PERSIST, "PERSIST") \ _(KEEP, "KEEP") \ - _(2MSL, "2MSL") \ - _(RETRANSMIT_SYN, "RETRANSMIT_SYN") \ + _(WAITCLOSE, "WAIT CLOSE") \ + _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ _(ESTABLISH, "ESTABLISH") typedef enum _tcp_timers @@ -89,6 +89,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_2MSL_TIME 300 /* 30s */ +#define TCP_CLOSEWAIT_TIME 1 /* 0.1s */ +#define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ #define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ @@ -102,6 +104,7 @@ void tcp_update_time (f64 now, u32 thread_index); _(DELACK, "Delay ACK") \ _(SNDACK, "Send ACK") \ _(BURSTACK, "Burst ACK set") \ + _(FINSNT, "FIN sent") \ _(SENT_RCV_WND0, "Sent 0 receive window") \ _(RECOVERY, "Recovery on") \ _(FAST_RECOVERY, "Fast Recovery on") @@ -331,6 +334,8 @@ clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); always_inline tcp_connection_t * tcp_connection_get (u32 conn_index, u32 thread_index) { + if (pool_is_free_index (tcp_main.connections[thread_index], conn_index)) + return 0; return pool_elt_at_index 
(tcp_main.connections[thread_index], conn_index); } @@ -347,6 +352,7 @@ tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) void tcp_connection_close (tcp_connection_t * tc); void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); +void tcp_connection_reset (tcp_connection_t * tc); always_inline tcp_connection_t * tcp_listener_get (u32 tli) @@ -361,7 +367,7 @@ tcp_half_open_connection_get (u32 conn_index) } void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); -void tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b); +void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4); void tcp_send_syn (tcp_connection_t * tc); @@ -467,7 +473,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) } always_inline void -tcp_retransmit_timer_set (tcp_main_t * tm, tcp_connection_t * tc) +tcp_retransmit_timer_set (tcp_connection_t * tc) { /* XXX Switch to faster TW */ tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index cff5ec13..2dbdd9b3 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -17,13 +17,13 @@ tcp_error (NONE, "no error") tcp_error (NO_LISTENER, "no listener for dst port") tcp_error (LOOKUP_DROPS, "lookup drops") tcp_error (DISPATCH, "Dispatch error") -tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (ENQUEUED, "Packets pushed into rx fifo") tcp_error (PURE_ACK, "Pure acks") tcp_error (SYNS_RCVD, "SYNs received") tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") -tcp_error (NOT_READY, "Session not ready for packets") -tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") -tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") +tcp_error (NOT_READY, "Session not ready for packets") +tcp_error (FIFO_FULL, "Packets dropped for lack of 
rx fifo space") +tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") tcp_error (SEGMENT_INVALID, "Invalid segment") @@ -32,4 +32,5 @@ tcp_error (ACK_DUP, "Duplicate ACK") tcp_error (ACK_OLD, "Old ACK") tcp_error (PKTS_SENT, "Packets sent") tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") -tcp_error (RST_SENT, "Resets sent") \ No newline at end of file +tcp_error (RST_SENT, "Resets sent") +tcp_error (INVALID_CONNECTION, "Invalid connection") diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0a907d0a..f19fbf87 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -274,10 +274,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* 2nd: check the RST bit */ if (tcp_rst (th0)) { - /* Notify session that connection has been reset. Switch - * state to closed and await for session to do the cleanup. */ - stream_session_reset_notify (&tc0->connection); - tc0->state = TCP_STATE_CLOSED; + tcp_connection_reset (tc0); return -1; } @@ -1023,6 +1020,12 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, my_thread_index); + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto drop; + } + /* Checksum computed by ipx_local no need to compute again */ if (is_ip4) @@ -1072,12 +1075,12 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (tcp_fin (th0)) { - /* Send ACK and enter CLOSE-WAIT */ - tcp_make_ack (tc0, b0); - tcp_connection_force_ack (tc0, b0); - next0 = tcp_next_output (tc0->c_is_ip4); + /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead + * wait for session to call close. To avoid lingering + * in CLOSE-WAIT, set timer (reuse WAITCLOSE). 
*/ tc0->state = TCP_STATE_CLOSE_WAIT; stream_session_disconnect_notify (&tc0->connection); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } drop: @@ -1468,7 +1471,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) = VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); /** - * Handles reception for all states except LISTEN, SYN-SEND and ESTABLISHED + * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED * as per RFC793 p. 64 */ always_inline uword @@ -1511,6 +1514,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, my_thread_index); + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto drop; + } /* Checksum computed by ipx_local no need to compute again */ @@ -1587,7 +1595,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Shoulder tap the server */ stream_session_accept_notify (&tc0->connection); - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + /* Reset SYN-ACK retransmit timer */ + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -1602,9 +1611,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * continue processing in that state. 
*/ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; - tc0->state = TCP_STATE_FIN_WAIT_2; - /* Stop all timers, 2MSL will be set lower */ - tcp_connection_timers_reset (tc0); + + /* If FIN is ACKed */ + if (tc0->snd_una == tc0->snd_una_max) + { + tc0->state = TCP_STATE_FIN_WAIT_2; + /* Stop all timers, 2MSL will be set lower */ + tcp_connection_timers_reset (tc0); + } break; case TCP_STATE_FIN_WAIT_2: /* In addition to the processing for the ESTABLISHED state, if @@ -1639,7 +1653,17 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; - tcp_connection_del (tc0); + tc0->state = TCP_STATE_CLOSED; + + /* Don't delete the connection/session yet. Instead, wait a + * reasonable amount of time until the pipes are cleared. In + * particular, this makes sure that we won't have dead sessions + * when processing events on the tx path */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + + /* Stop retransmit */ + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); + goto drop; break; @@ -1684,7 +1708,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_SYN_RCVD: /* Send FIN-ACK notify app and enter CLOSE-WAIT */ tcp_connection_timers_reset (tc0); - tcp_make_finack (tc0, b0); + tcp_make_fin (tc0, b0); next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; @@ -1697,12 +1721,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_1: tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); - tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! 
*/ tc0->state = TCP_STATE_TIME_WAIT; - tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); break; @@ -1710,7 +1734,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait * timeout. */ - tcp_timer_update (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; } @@ -2113,6 +2137,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_left_to_next -= 1; b0 = vlib_get_buffer (vm, bi0); + vnet_buffer (b0)->tcp.flags = 0; if (is_ip4) { @@ -2168,7 +2193,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Send reset */ next0 = TCP_INPUT_NEXT_RESET; error0 = TCP_ERROR_NO_LISTENER; - vnet_buffer (b0)->tcp.flags = 0; } b0->error = error0 ? node->errors[error0] : 0; @@ -2288,6 +2312,7 @@ do { \ _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 7e431cd0..aa43e9f3 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -396,6 +396,7 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + vnet_buffer (b)->tcp.flags = 0; } /** @@ -443,16 +444,22 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) * Convert buffer to FIN-ACK */ void -tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b) +tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { 
tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = tm->vlib_main; + u8 flags = 0; tcp_reuse_buffer (vm, b); - tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK | TCP_FLAG_FIN); + + if (tc->rcv_las == tc->rcv_nxt) + flags = TCP_FLAG_FIN; + else + flags = TCP_FLAG_FIN | TCP_FLAG_ACK; + + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, flags); /* Reset flags, make sure ack is sent */ - tc->flags = TCP_CONN_SNDACK; vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; tc->snd_nxt += 1; @@ -500,7 +507,7 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; /* Init retransmit timer */ - tcp_retransmit_timer_set (tm, tc); + tcp_retransmit_timer_set (tc); } always_inline void @@ -818,9 +825,9 @@ tcp_send_fin (tcp_connection_t * tc) /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_make_finack (tc, b); - + tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc->flags |= TCP_CONN_FINSNT; } always_inline u8 @@ -1038,7 +1045,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable retransmit timer */ - tcp_retransmit_timer_set (tm, tc); + tcp_retransmit_timer_set (tc); } else { @@ -1139,7 +1146,6 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { - tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->cpu_index; @@ -1172,6 +1178,13 @@ tcp46_output_inline (vlib_main_t * vm, b0 = vlib_get_buffer (vm, bi0); tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, my_thread_index); + if (PREDICT_FALSE (tc0 == 0 || tc0->state == TCP_STATE_CLOSED)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + next0 = TCP_OUTPUT_NEXT_DROP; + goto done; + } + th0 = vlib_buffer_get_current (b0); if (is_ip4) @@ -1229,6 +1242,22 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rtt_ts = 
tcp_time_now (); tc0->rtt_seq = tc0->snd_nxt; } + + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = + "output: snd_una %u snd_una_max %u",.format_args = + "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->data[0] = tc0->snd_una - tc0->iss; + ed->data[1] = tc0->snd_una_max - tc0->iss; + } } /* Set the retransmit timer if not set already and not @@ -1236,7 +1265,7 @@ tcp46_output_inline (vlib_main_t * vm, if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT) && tc0->snd_nxt != tc0->snd_una) { - tcp_retransmit_timer_set (tm, tc0); + tcp_retransmit_timer_set (tc0); tc0->rto_boff = 0; } diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c index afa66ba4..46c8e734 100644 --- a/src/vnet/udp/builtin_server.c +++ b/src/vnet/udp/builtin_server.c @@ -39,7 +39,7 @@ builtin_session_disconnect_callback (stream_session_t * s) } static int -builtin_server_rx_callback (stream_session_t * s) +builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) { svm_fifo_t *rx_fifo, *tx_fifo; u32 this_transfer; diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 4d509335..88278735 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -244,19 +244,19 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* Get session's server */ server0 = application_get (s0->app_index); - /* Built-in server? Deliver the goods... */ - if (server0->cb_fns.builtin_server_rx_callback) - { - server0->cb_fns.builtin_server_rx_callback (s0); - continue; - } - /* Fabricate event */ evt.fifo = s0->server_rx_fifo; evt.event_type = FIFO_EVENT_SERVER_RX; evt.event_id = serial_number++; evt.enqueue_length = svm_fifo_max_dequeue (s0->server_rx_fifo); + /* Built-in server? Deliver the goods... 
*/ + if (server0->cb_fns.builtin_server_rx_callback) + { + server0->cb_fns.builtin_server_rx_callback (s0, &evt); + continue; + } + /* Add event to server's event queue */ q = server0->event_queue; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index 496f3885..fb1a8bac 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -1435,7 +1435,8 @@ done: VLIB_CLI_COMMAND (tap_connect_command, static) = { .path = "tap connect", - .short_help = "tap connect [hwaddr ]", + .short_help = + "tap connect [address /mw] [hwaddr ]", .function = tap_connect_command_fn, }; -- cgit 1.2.3-korg From e69f4954a9de40a47f0bc27cdab0ba44e6985dac Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 7 Mar 2017 10:06:24 -0800 Subject: VPP-659 Improve tcp/session debugging and testing - event-logging support for tcp and session layer - improvements to uri test code - builtin_server on port 1234 - use the CLOSEWAIT timer when we rx FIN in FIN_WAIT_2 state Change-Id: Ibc445f164b2086b20323bf89c77cffd3059f570f Signed-off-by: Florin Coras Signed-off-by: Dave Barach Signed-off-by: Dave Barach --- src/scripts/vnet/uri/dpdk_setup.cli | 4 + src/uri/uri_tcp_test.c | 43 +++-- src/vnet.am | 2 + src/vnet/session/node.c | 18 +- src/vnet/session/session.c | 24 +-- src/vnet/session/session.h | 3 +- src/vnet/session/session_debug.h | 86 ++++++++++ src/vnet/session/transport.h | 12 +- src/vnet/tcp/builtin_server.c | 2 +- src/vnet/tcp/tcp.c | 181 +++++++++++++-------- src/vnet/tcp/tcp.h | 6 +- src/vnet/tcp/tcp_debug.h | 316 ++++++++++++++++++++++++++++++++++++ src/vnet/tcp/tcp_input.c | 34 ++-- src/vnet/tcp/tcp_output.c | 76 ++++----- src/vnet/udp/udp.c | 10 +- 15 files changed, 636 insertions(+), 181 deletions(-) create mode 100644 src/scripts/vnet/uri/dpdk_setup.cli create mode 100644 src/vnet/session/session_debug.h create mode 100644 src/vnet/tcp/tcp_debug.h (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/scripts/vnet/uri/dpdk_setup.cli b/src/scripts/vnet/uri/dpdk_setup.cli 
new file mode 100644 index 00000000..02bba58f --- /dev/null +++ b/src/scripts/vnet/uri/dpdk_setup.cli @@ -0,0 +1,4 @@ +set int state GigabitEthernet1b/0/0 up +set int ip address GigabitEthernet1b/0/0 6.0.1.1/24 +trace add dpdk-input 10 +session enable diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 261fd288..406a5f4e 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -848,13 +848,21 @@ server_test (uri_tcp_test_main_t * utm) fformat (stdout, "Test complete...\n"); } -#define foreach_uri_msg \ -_(BIND_URI_REPLY, bind_uri_reply) \ -_(UNBIND_URI_REPLY, unbind_uri_reply) \ -_(ACCEPT_SESSION, accept_session) \ -_(CONNECT_URI_REPLY, connect_uri_reply) \ -_(DISCONNECT_SESSION, disconnect_session) \ -_(RESET_SESSION, reset_session) \ +static void +vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * + mp) +{ + clib_warning ("retval %d", ntohl (mp->retval)); +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \ +_(RESET_SESSION, reset_session) \ _(MAP_ANOTHER_SEGMENT, map_another_segment) void @@ -877,8 +885,9 @@ main (int argc, char **argv) uri_tcp_test_main_t *utm = &uri_tcp_test_main; unformat_input_t _argv, *a = &_argv; u8 *chroot_prefix; - u8 *heap; - u8 *bind_name = (u8 *) "tcp://0.0.0.0/1234"; + u8 *heap, *uri = 0; + u8 *bind_uri = (u8 *) "tcp://0.0.0.0/1234"; + u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; u32 tmp; mheap_t *h; session_t *session; @@ -911,7 +920,7 @@ main (int argc, char **argv) { vl_set_memory_root_path ((char *) chroot_prefix); } - else if (unformat (a, "uri %s", &bind_name)) + else if (unformat (a, "uri %s", &uri)) ; else if (unformat (a, "segment-size %dM", &tmp)) utm->configured_segment_size = tmp << 20; @@ -932,12 +941,21 @@ main (int argc, char 
**argv) } } - utm->uri = format (0, "%s%c", bind_name, 0); + if (uri) + { + utm->uri = format (0, "%s%c", uri, 0); + utm->connect_uri = format (0, "%s%c", uri, 0); + } + else + { + utm->uri = format (0, "%s%c", bind_uri, 0); + utm->connect_uri = format (0, "%s%c", connect_uri, 0); + } + utm->i_am_master = i_am_master; utm->segment_main = &svm_fifo_segment_main; utm->drop_packets = drop_packets; utm->test_return_packets = test_return_packets; - utm->connect_uri = format (0, "tcp://6.0.1.2/1234%c", 0); setup_signal_handlers (); uri_api_hookup (utm); @@ -952,6 +970,7 @@ main (int argc, char **argv) if (i_am_master == 0) { client_test (utm); + vl_client_disconnect_from_vlib (); exit (0); } diff --git a/src/vnet.am b/src/vnet.am index bc9655cc..223d5d93 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -469,6 +469,7 @@ libvnet_la_SOURCES += \ nobase_include_HEADERS += \ vnet/tcp/tcp_packet.h \ vnet/tcp/tcp_timer.h \ + vnet/tcp/tcp_debug.h \ vnet/tcp/tcp.h ######################################## @@ -837,6 +838,7 @@ nobase_include_HEADERS += \ vnet/session/application.h \ vnet/session/transport.h \ vnet/session/application_interface.h \ + vnet/session/session_debug.h \ vnet/session/session.api.h API_FILES += vnet/session/session.api diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 7fd7e0b7..822afebd 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -26,8 +26,8 @@ #include #include -#include #include +#include vlib_node_registration_t session_queue_node; @@ -198,22 +198,12 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, len_to_deq0 = (left_to_snd0 < snd_mss0) ? 
left_to_snd0 : snd_mss0; /* *INDENT-OFF* */ - if (1) - { - ELOG_TYPE_DECLARE (e) = { - .format = "evt-deq: id %d len %d rd %d wnd %d", - .format_args = "i4i4i4i4", - }; - struct - { - u32 data[4]; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); + SESSION_EVT_DBG(s0, SESSION_EVT_DEQ, ({ ed->data[0] = e0->event_id; ed->data[1] = e0->enqueue_length; ed->data[2] = len_to_deq0; ed->data[3] = left_to_snd0; - } + })); /* *INDENT-ON* */ /* Make room for headers */ @@ -392,7 +382,7 @@ skip_dequeue: case FIFO_EVENT_SERVER_TX: /* Spray packets in per session type frames, since they go to * different nodes */ - rv = (smm->session_rx_fns[s0->session_type]) (vm, node, smm, e0, s0, + rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, my_thread_index, &n_tx_packets); if (rv < 0) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 8867e794..06e2a09a 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -23,6 +23,7 @@ #include #include #include +#include /** * Per-type vector of transport protocol virtual function tables @@ -823,19 +824,12 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) else return -1; - if (1) - { - ELOG_TYPE_DECLARE (e) = - { - .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; - struct - { - u32 data[2]; - } *ed; - ed = ELOG_DATA (&vlib_global_main.elog_main, e); + /* *INDENT-OFF* */ + SESSION_EVT_DBG(s, SESSION_EVT_ENQ, ({ ed->data[0] = evt.event_id; ed->data[1] = evt.enqueue_length; - } + })); + /* *INDENT-ON* */ return 0; } @@ -908,8 +902,7 @@ stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port) s->app_index = srv->index; /* Transport bind/listen */ - tci = tp_vfts[srv->session_type].bind (smm->vlib_main, s->session_index, ip, - port); + tci = tp_vfts[srv->session_type].bind (s->session_index, ip, port); /* Attach transport to session */ s->connection_index = tci; @@ -938,8 +931,7 @@ stream_session_stop_listen (u32 server_index) tc = 
tp_vfts[srv->session_type].get_listener (listener->connection_index); stream_session_table_del_for_tc (smm, listener->session_type, tc); - tp_vfts[srv->session_type].unbind (smm->vlib_main, - listener->connection_index); + tp_vfts[srv->session_type].unbind (listener->connection_index); pool_put (smm->listen_sessions[srv->session_type], listener); } @@ -1235,7 +1227,7 @@ session_register_transport (u8 type, const transport_proto_vft_t * vft) tp_vfts[type] = *vft; /* If an offset function is provided, then peek instead of dequeue */ - smm->session_rx_fns[type] = + smm->session_tx_fns[type] = (vft->tx_fifo_offset) ? session_tx_fifo_peek_and_snd : session_tx_fifo_dequeue_and_snd; } diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 1b712e2e..96c00d87 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -211,7 +211,7 @@ struct _session_manager_main session_manager_t *session_managers; /** Per transport rx function that can either dequeue or peek */ - session_fifo_rx_fn *session_rx_fns[SESSION_N_TYPES]; + session_fifo_rx_fn *session_tx_fns[SESSION_N_TYPES]; u8 is_enabled; @@ -358,6 +358,7 @@ u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); void stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail); + void stream_session_accept_notify (transport_connection_t * tc); void stream_session_disconnect_notify (transport_connection_t * tc); void stream_session_delete_notify (transport_connection_t * tc); diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h new file mode 100644 index 00000000..858f12e0 --- /dev/null +++ b/src/vnet/session/session_debug.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_VNET_SESSION_SESSION_DEBUG_H_ +#define SRC_VNET_SESSION_SESSION_DEBUG_H_ + +#include +#include +#include + +#define foreach_session_dbg_evt \ + _(ENQ, "enqueue") \ + _(DEQ, "dequeue") + +typedef enum _session_evt_dbg +{ +#define _(sym, str) SESSION_EVT_##sym, + foreach_session_dbg_evt +#undef _ +} session_evt_dbg_e; + +#if TRANSPORT_DEBUG + +#define DEC_SESSION_ETD(_s, _e, _size) \ + struct \ + { \ + u32 data[_size]; \ + } * ed; \ + transport_proto_vft_t *vft = \ + session_get_transport_vft (_s->session_type); \ + transport_connection_t *_tc = \ + vft->get_connection (_s->connection_index, _s->thread_index); \ + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ + _e, _tc->elog_track) + + +#define SESSION_EVT_DEQ_HANDLER(_s, _body) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "deq: id %d len %d rd %d wnd %d", \ + .format_args = "i4i4i4i4", \ + }; \ + DEC_SESSION_ETD(_s, _e, 4); \ + do { _body; } while (0); \ +} + +#define SESSION_EVT_ENQ_HANDLER(_s, _body) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "enq: id %d length %d", \ + .format_args = "i4i4", \ + }; \ + DEC_SESSION_ETD(_s, _e, 2); \ + do { _body; } while (0); \ +} + +#define CONCAT_HELPER(_a, _b) _a##_b +#define CC(_a, _b) CONCAT_HELPER(_a, _b) + +#define SESSION_EVT_DBG(_s, _evt, _body) CC(_evt, _HANDLER)(_s, _body) + +#else +#define SESSION_EVT_DBG(_s, _evt, _body) +#endif + +#endif /* SRC_VNET_SESSION_SESSION_DEBUG_H_ */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git 
a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 0da30261..421121d2 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -20,7 +20,7 @@ #include #include #include - +#include /* * Protocol independent transport properties associated to a session */ @@ -37,6 +37,10 @@ typedef struct _transport_connection u8 is_ip4; /**< Flag if IP4 connection */ u32 thread_index; /**< Worker-thread index */ +#if TRANSPORT_DEBUG + elog_track_t elog_track; /**< Debug purposes */ +#endif + /** Macros for 'derived classes' where base is named "connection" */ #define c_lcl_ip connection.lcl_ip #define c_rmt_ip connection.rmt_ip @@ -52,6 +56,7 @@ typedef struct _transport_connection #define c_c_index connection.c_index #define c_is_ip4 connection.is_ip4 #define c_thread_index connection.thread_index +#define c_elog_track connection.elog_track } transport_connection_t; /* @@ -62,8 +67,8 @@ typedef struct _transport_proto_vft /* * Setup */ - u32 (*bind) (vlib_main_t *, u32, ip46_address_t *, u16); - u32 (*unbind) (vlib_main_t *, u32); + u32 (*bind) (u32, ip46_address_t *, u16); + u32 (*unbind) (u32); int (*open) (ip46_address_t * addr, u16 port_host_byte_order); void (*close) (u32 conn_index, u32 thread_index); void (*cleanup) (u32 conn_index, u32 thread_index); @@ -89,7 +94,6 @@ typedef struct _transport_proto_vft u8 *(*format_connection) (u8 * s, va_list * args); u8 *(*format_listener) (u8 * s, va_list * args); u8 *(*format_half_open) (u8 * s, va_list * args); - } transport_proto_vft_t; /* *INDENT-OFF* */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 9b697a01..dd6759c5 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -160,7 +160,7 @@ server_create (vlib_main_t * vm) memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); - a->uri = "tcp://0.0.0.0/80"; + a->uri = "tcp://0.0.0.0/1234"; a->api_client_index = ~0; a->session_cb_vft = &builtin_session_cb_vft; a->options = 
options; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index d2df5c3e..0d2e6d0e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -21,7 +21,7 @@ tcp_main_t tcp_main; static u32 -tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, +tcp_connection_bind (u32 session_index, ip46_address_t * ip, u16 port_host_byte_order, u8 is_ip4) { tcp_main_t *tm = &tcp_main; @@ -43,42 +43,41 @@ tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, listener->state = TCP_STATE_LISTEN; listener->c_is_ip4 = 1; + tcp_connection_timers_init (listener); + + TCP_EVT_DBG (TCP_EVT_BIND, listener); + return listener->c_c_index; } u32 -tcp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, - ip46_address_t * ip, u16 port_host_byte_order) +tcp_session_bind_ip4 (u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order) { - return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 1); + return tcp_connection_bind (session_index, ip, port_host_byte_order, 1); } u32 -tcp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, - ip46_address_t * ip, u16 port_host_byte_order) +tcp_session_bind_ip6 (u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order) { - return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 0); + return tcp_connection_bind (session_index, ip, port_host_byte_order, 0); } static void -tcp_session_unbind (u32 listener_index) +tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); + TCP_EVT_DBG (TCP_EVT_UNBIND, + pool_elt_at_index (tm->listener_pool, listener_index)); pool_put_index (tm->listener_pool, listener_index); } u32 -tcp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) -{ - tcp_session_unbind (listener_index); - return 0; -} - -u32 -tcp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +tcp_session_unbind (u32 listener_index) { - tcp_session_unbind (listener_index); + tcp_connection_unbind (listener_index); 
return 0; } @@ -135,6 +134,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) void tcp_connection_del (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_DELETE, tc); stream_session_delete_notify (&tc->connection); tcp_connection_cleanup (tc); } @@ -169,6 +169,8 @@ tcp_connection_reset (tcp_connection_t * tc) void tcp_connection_close (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_CLOSE, tc); + /* Send FIN if needed */ if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) @@ -403,6 +405,8 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) tc->state = TCP_STATE_SYN_SENT; + TCP_EVT_DBG (TCP_EVT_OPEN, tc); + return tc->c_c_index; } @@ -418,82 +422,119 @@ tcp_session_open_ip6 (ip46_address_t * addr, u16 port) return tcp_connection_open (addr, port, 0); } +const char *tcp_dbg_evt_str[] = { +#define _(sym, str) str, + foreach_tcp_dbg_evt +#undef _ +}; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + u8 * -format_tcp_session_ip4 (u8 * s, va_list * args) +format_tcp_state (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - u32 thread_index = va_arg (*args, u32); - tcp_connection_t *tc; + tcp_state_t *state = va_arg (*args, tcp_state_t *); - tc = tcp_connection_get (tci, thread_index); + if (*state < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[*state]); + else + s = format (s, "UNKNOWN"); + + return s; +} + +const char *tcp_conn_timers[] = { +#define _(sym, str) str, + foreach_tcp_timer +#undef _ +}; + +u8 * +format_tcp_timers (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + int i, last = 0; - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); + for (i = 0; i < TCP_N_TIMERS; i++) + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + last 
= i; + + s = format (s, "["); + for (i = 0; i < last; i++) + { + if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) + s = format (s, "%s,", tcp_conn_timers[i]); + } + + if (last > 0) + s = format (s, "%s]", tcp_conn_timers[i]); + else + s = format (s, "]"); return s; } u8 * -format_tcp_session_ip6 (u8 * s, va_list * args) +format_tcp_connection (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - u32 thread_index = va_arg (*args, u32); - tcp_connection_t *tc = tcp_connection_get (tci, thread_index); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + + if (tc->c_is_ip4) + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip4_address, &tc->c_lcl_ip4, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address, + &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port)); + } + else + { + s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", + format_ip6_address, &tc->c_lcl_ip6, + clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address, + &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port)); + } + return s; } u8 * -format_tcp_listener_session_ip4 (u8 * s, va_list * args) +format_tcp_connection_verbose (u8 * s, va_list * args) { - u32 tci = va_arg (*args, u32); - tcp_connection_t *tc = tcp_listener_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + s = format (s, "%U %U %U", format_tcp_connection, tc, format_tcp_state, + &tc->state, format_tcp_timers, tc); return s; } u8 * -format_tcp_listener_session_ip6 (u8 * s, va_list * args) +format_tcp_session (u8 * s, va_list * args) { u32 tci = va_arg 
(*args, u32); - tcp_connection_t *tc = tcp_listener_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc; + + tc = tcp_connection_get (tci, thread_index); + return format (s, "%U", format_tcp_connection, tc); } u8 * -format_tcp_half_open_session_ip4 (u8 * s, va_list * args) +format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); - tcp_connection_t *tc = tcp_half_open_connection_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, - &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip4_address, &tc->c_rmt_ip4, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + tcp_connection_t *tc = tcp_listener_get (tci); + return format (s, "%U", format_tcp_connection, tc); } u8 * -format_tcp_half_open_session_ip6 (u8 * s, va_list * args) +format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); tcp_connection_t *tc = tcp_half_open_connection_get (tci); - s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, - &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), - format_ip6_address, &tc->c_rmt_ip6, - clib_net_to_host_u16 (tc->c_rmt_port)); - return s; + return format (s, "%U", format_tcp_connection, tc); } transport_connection_t * @@ -534,7 +575,7 @@ tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) /* *INDENT-OFF* */ const static transport_proto_vft_t tcp4_proto = { .bind = tcp_session_bind_ip4, - .unbind = tcp_session_unbind_ip4, + .unbind = tcp_session_unbind, .push_header = tcp_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, @@ -545,14 +586,14 @@ const static transport_proto_vft_t tcp4_proto = { .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, 
.tx_fifo_offset = tcp_session_tx_fifo_offset, - .format_connection = format_tcp_session_ip4, - .format_listener = format_tcp_listener_session_ip4, - .format_half_open = format_tcp_half_open_session_ip4 + .format_connection = format_tcp_session, + .format_listener = format_tcp_listener_session, + .format_half_open = format_tcp_half_open_session, }; const static transport_proto_vft_t tcp6_proto = { .bind = tcp_session_bind_ip6, - .unbind = tcp_session_unbind_ip6, + .unbind = tcp_session_unbind, .push_header = tcp_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, @@ -563,9 +604,9 @@ const static transport_proto_vft_t tcp6_proto = { .send_mss = tcp_session_send_mss, .send_space = tcp_session_send_space, .tx_fifo_offset = tcp_session_tx_fifo_offset, - .format_connection = format_tcp_session_ip6, - .format_listener = format_tcp_listener_session_ip6, - .format_half_open = format_tcp_half_open_session_ip6 + .format_connection = format_tcp_session, + .format_listener = format_tcp_listener_session, + .format_half_open = format_tcp_half_open_session, }; /* *INDENT-ON* */ @@ -654,6 +695,8 @@ tcp_expired_timers_dispatch (u32 * expired_timers) connection_index = expired_timers[i] & 0x0FFFFFFF; timer_id = expired_timers[i] >> 28; + TCP_EVT_DBG (TCP_EVT_TIMER_POP, connection_index, timer_id); + /* Handle expiration */ (*timer_expiration_handlers[timer_id]) (connection_index); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 3b3d8fc7..082ab1d8 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -22,6 +22,7 @@ #include #include #include +#include #define TCP_TICK 10e-3 /**< TCP tick period (s) */ #define THZ 1/TCP_TICK /**< TCP tick frequency */ @@ -222,7 +223,7 @@ typedef struct _tcp_connection u32 prev_ssthresh; /**< ssthresh before congestion */ u32 bytes_acked; /**< Bytes acknowledged by current segment */ u32 rtx_bytes; /**< Retransmitted bytes */ - u32 tsecr_last_ack; /**< Timestamp echoed to us in last 
health ACK */ + u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ /* RTT and RTO */ @@ -354,6 +355,9 @@ void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); void tcp_connection_reset (tcp_connection_t * tc); +u8 *format_tcp_connection (u8 * s, va_list * args); +u8 *format_tcp_connection_verbose (u8 * s, va_list * args); + always_inline tcp_connection_t * tcp_listener_get (u32 tli) { diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h new file mode 100644 index 00000000..069c512d --- /dev/null +++ b/src/vnet/tcp/tcp_debug.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_VNET_TCP_TCP_DEBUG_H_ +#define SRC_VNET_TCP_TCP_DEBUG_H_ + +#include + +#define TCP_DEBUG (1) + +#define foreach_tcp_dbg_evt \ + _(INIT, "") \ + _(DEALLOC, "") \ + _(OPEN, "open") \ + _(CLOSE, "close") \ + _(BIND, "bind") \ + _(UNBIND, "unbind") \ + _(DELETE, "delete") \ + _(SYN_SENT, "SYN sent") \ + _(FIN_SENT, "FIN sent") \ + _(RST_SENT, "RST sent") \ + _(SYN_RCVD, "SYN rcvd") \ + _(ACK_RCVD, "ACK rcvd") \ + _(FIN_RCVD, "FIN rcvd") \ + _(RST_RCVD, "RST rcvd") \ + _(PKTIZE, "packetize") \ + _(INPUT, "in") \ + _(TIMER_POP, "timer pop") + +typedef enum _tcp_dbg +{ +#define _(sym, str) TCP_DBG_##sym, + foreach_tcp_dbg_evt +#undef _ +} tcp_dbg_e; + +typedef enum _tcp_dbg_evt +{ +#define _(sym, str) TCP_EVT_##sym, + foreach_tcp_dbg_evt +#undef _ +} tcp_dbg_evt_e; + +#if TCP_DEBUG + +#define TRANSPORT_DEBUG (1) + +#define TCP_DBG(_tc, _evt, _args...) \ +{ \ + u8 *_tmp = 0; \ + _tmp = format(_tmp, "%U", format_tcp_connection_verbose, _tc); \ + clib_warning("%s", _tmp); \ + vec_free(_tmp); \ +} + +#define DECLARE_ETD(_tc, _e, _size) \ + struct \ + { \ + u32 data[_size]; \ + } * ed; \ + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ + _e, _tc->c_elog_track) + +#define TCP_EVT_INIT_HANDLER(_tc, ...) \ +{ \ + _tc->c_elog_track.name = \ + (char *) format (0, "%d%c", _tc->c_c_index, 0); \ + elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\ +} + +#define TCP_EVT_DEALLOC_HANDLER(_tc, ...) \ +{ \ + vec_free (_tc->c_elog_track.name); \ +} + +#define TCP_EVT_OPEN_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "open: index %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_CLOSE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "close: %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_BIND_HANDLER(_tc, ...) 
\ +{ \ + TCP_EVT_INIT_HANDLER(_tc); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "bind: listener %d", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ +} + +#define TCP_EVT_UNBIND_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "unbind: listener %d", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->c_c_index; \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ +} + +#define TCP_EVT_DELETE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "delete: %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 0); \ + ed->data[0] = _tc->c_c_index; \ + TCP_EVT_DEALLOC_HANDLER(_tc); \ +} + +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYN: iss %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->iss; \ +} + +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "FIN: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "RST: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYN rcvd: irs %d", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ +} + +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "FIN rcvd: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "RST rcvd: snd_nxt %d rcv_nxt %d", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ACK: acked %u cwnd %u inflight %u", \ + .format_args = "i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 3); \ + ed->data[0] = _tc->bytes_acked; \ + ed->data[1] = _tc->cwnd; \ + ed->data[2] = tcp_flight_size(_tc); \ +} + +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "pktize: snd_una %u snd_nxt %u una_max %u", \ + .format_args = "i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 3); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_nxt - _tc->iss; \ + ed->data[2] = _tc->snd_una_max - _tc->iss; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} + +#define TCP_EVT_INPUT_HANDLER(_tc, n_bytes, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "in: bytes %u rcv_nxt %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = n_bytes; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ +} + +#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) 
\ +{ \ + tcp_connection_t *_tc; \ + if (_timer_id == TCP_TIMER_RETRANSMIT_SYN) \ + { \ + _tc = tcp_half_open_connection_get (_tc_index); \ + } \ + else \ + { \ + u32 _thread_index = os_get_cpu_number (); \ + _tc = tcp_connection_get (_tc_index, _thread_index); \ + } \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "TimerPop: %s (%d)", \ + .format_args = "t4i4", \ + .n_enum_strings = 7, \ + .enum_strings = { \ + "retransmit", \ + "delack", \ + "BUG", \ + "keep", \ + "waitclose", \ + "retransmit syn", \ + "establish", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _timer_id; \ + ed->data[1] = _timer_id; \ +} + +#define CONCAT_HELPER(_a, _b) _a##_b +#define CC(_a, _b) CONCAT_HELPER(_a, _b) + +#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) + +#else +#define TCP_EVT_DBG(_evt, _args...) +#endif + + +#endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index f19fbf87..67af4321 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -730,6 +730,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, /* Updates congestion control (slow start/congestion avoidance) */ tcp_cc_rcv_ack (tc); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + /* If everything has been acked, stop retransmit timer * otherwise update */ if (tc->snd_una == tc->snd_una_max) @@ -922,6 +924,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); + TCP_EVT_DBG (TCP_EVT_INPUT, tc, n_data_bytes); + /* Check if ACK can be delayed */ if (tcp_can_delack (tc)) { @@ -1079,6 +1083,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * wait for session to call close. To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). 
*/ tc0->state = TCP_STATE_CLOSE_WAIT; + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); stream_session_disconnect_notify (&tc0->connection); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } @@ -1134,7 +1139,8 @@ VLIB_REGISTER_NODE (tcp4_established_node) = .name = "tcp4-established", /* Takes a vector of packets. */ .vector_size = sizeof (u32), - .n_errors = TCP_N_ERROR,.error_strings = tcp_error_strings, + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, .n_next_nodes = TCP_ESTABLISHED_N_NEXT, .next_nodes = { @@ -1363,7 +1369,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { new_tc0->state = TCP_STATE_SYN_RCVD; - /* Notify app that we have connection XXX */ + /* Notify app that we have connection */ stream_session_connect_notify (&new_tc0->connection, sst, 0); tcp_make_synack (new_tc0, b0); @@ -1726,7 +1732,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! */ tc0->state = TCP_STATE_TIME_WAIT; - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); break; @@ -1737,6 +1743,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; } + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); b0->error = error0 ? 
node->errors[error0] : 0; @@ -1950,6 +1957,8 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_connection_init_vars (child0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0); + /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); @@ -2064,25 +2073,6 @@ typedef struct u8 state; } tcp_rx_trace_t; -const char *tcp_fsm_states[] = { -#define _(sym, str) str, - foreach_tcp_fsm_state -#undef _ -}; - -u8 * -format_tcp_state (u8 * s, va_list * args) -{ - tcp_state_t *state = va_arg (*args, tcp_state_t *); - - if (state[0] < TCP_N_STATES) - s = format (s, "%s", tcp_fsm_states[state[0]]); - else - s = format (s, "UNKNOWN"); - - return s; -} - u8 * format_tcp_rx_trace (u8 * s, va_list * args) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index aa43e9f3..114a5b9e 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -452,11 +452,7 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); - if (tc->rcv_las == tc->rcv_nxt) - flags = TCP_FLAG_FIN; - else - flags = TCP_FLAG_FIN | TCP_FLAG_ACK; - + flags = TCP_FLAG_FIN | TCP_FLAG_ACK; tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, flags); /* Reset flags, make sure ack is sent */ @@ -828,6 +824,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 @@ -887,6 +884,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } /* Send delayed ACK when timer expires */ @@ -1186,6 +1184,7 @@ tcp46_output_inline (vlib_main_t * vm, } th0 = vlib_buffer_get_current (b0); + TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length); if (is_ip4) { @@ -1242,22 +1241,6 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rtt_ts = tcp_time_now (); 
tc0->rtt_seq = tc0->snd_nxt; } - - if (1) - { - ELOG_TYPE_DECLARE (e) = - { - .format = - "output: snd_una %u snd_una_max %u",.format_args = - "i4i4",}; - struct - { - u32 data[2]; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->data[0] = tc0->snd_una - tc0->iss; - ed->data[1] = tc0->snd_una_max - tc0->iss; - } } /* Set the retransmit timer if not set already and not @@ -1275,9 +1258,8 @@ tcp46_output_inline (vlib_main_t * vm, vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; - done: - b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -1307,34 +1289,50 @@ tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); } +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp4_output_node) = { .function = tcp4_output,.name = "tcp4-output", /* Takes a vector of packets. */ - .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = - tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = - { + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, foreach_tcp4_output_next #undef _ - } -,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ -VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output) +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output); + +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_output_node) = { - .function = tcp6_output,.name = "tcp6-output", + .function = tcp6_output, + .name = "tcp6-output", /* Takes a vector of packets. 
*/ - .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = - tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = - { + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, foreach_tcp6_output_next #undef _ - } -,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output); -VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output) u32 +u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) { tcp_connection_t *tc; @@ -1405,7 +1403,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = TCP_RESET_NEXT_IP_LOOKUP; done: - b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->error = node->errors[error0]; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -1450,6 +1448,8 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { }; /* *INDENT-ON* */ +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_reset_node, tcp4_send_reset); + /* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_reset_node) = { .function = tcp6_send_reset, @@ -1466,6 +1466,8 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { }; /* *INDENT-ON* */ +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_reset_node, tcp6_send_reset); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/udp/udp.c b/src/vnet/udp/udp.c index 9e740466..57e4a60e 100644 --- a/src/vnet/udp/udp.c +++ b/src/vnet/udp/udp.c @@ -25,7 +25,7 @@ udp_uri_main_t udp_uri_main; u32 -udp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, +udp_session_bind_ip4 (u32 session_index, ip46_address_t * ip, u16 port_number_host_byte_order) { udp_uri_main_t *um = vnet_get_udp_main (); @@ -42,7 +42,7 @@ udp_session_bind_ip4 (vlib_main_t * vm, 
u32 session_index, } u32 -udp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, +udp_session_bind_ip6 (u32 session_index, ip46_address_t * ip, u16 port_number_host_byte_order) { udp_uri_main_t *um = vnet_get_udp_main (); @@ -58,8 +58,9 @@ udp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, } u32 -udp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) +udp_session_unbind_ip4 (u32 listener_index) { + vlib_main_t *vm = vlib_get_main (); udp_connection_t *listener; listener = udp_listener_get (listener_index); @@ -69,8 +70,9 @@ udp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) } u32 -udp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +udp_session_unbind_ip6 (u32 listener_index) { + vlib_main_t *vm = vlib_get_main (); udp_connection_t *listener; listener = udp_listener_get (listener_index); -- cgit 1.2.3-korg From 6792ec059696a358b6c98d8d86e9740b34c01e24 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 13 Mar 2017 03:49:51 -0700 Subject: TCP/session improvements - Added svm fifo flag for tracking fifo dequeue events (replaces event length). Updated all code to switch to the new scheme. 
- More session debugging - Fix peek index wrap - Add a trivial socket test client - Fast retransmit/cc fixes - tx and rx SACK fixes and unit testing - SRTT computation fix - remove dupack/ack burst filters - improve ack rx - improved segment rx - builtin client test code Change-Id: Ic4eb2d5ca446eb2260ccd3ccbcdaa73c64e7f4e1 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 35 +-- src/svm/svm_fifo.h | 28 ++- src/svm/svm_fifo_segment.h | 4 +- src/uri.am | 5 +- src/uri/uri_socket_test.c | 126 ++++++++++ src/uri/uri_tcp_test.c | 161 +++++++++---- src/uri/uri_udp_test.c | 13 +- src/vnet.am | 2 + src/vnet/session/application.h | 3 +- src/vnet/session/node.c | 127 ++++++---- src/vnet/session/session.c | 63 +++-- src/vnet/session/session.h | 19 +- src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_debug.h | 38 ++- src/vnet/session/transport.h | 2 +- src/vnet/tcp/builtin_client.c | 411 +++++++++++++++++++++++++++++++ src/vnet/tcp/builtin_client.h | 131 ++++++++++ src/vnet/tcp/builtin_server.c | 91 +++++-- src/vnet/tcp/tcp.c | 37 ++- src/vnet/tcp/tcp.h | 111 +++++++-- src/vnet/tcp/tcp_debug.h | 252 ++++++++++++++++--- src/vnet/tcp/tcp_error.def | 7 +- src/vnet/tcp/tcp_input.c | 507 +++++++++++++++++++++++++-------------- src/vnet/tcp/tcp_output.c | 295 ++++++++++++++++------- src/vnet/tcp/tcp_packet.h | 2 +- src/vnet/tcp/tcp_test.c | 216 +++++++++++++++++ src/vnet/udp/builtin_server.c | 29 ++- src/vnet/udp/udp_input.c | 47 ++-- 28 files changed, 2201 insertions(+), 563 deletions(-) create mode 100644 src/uri/uri_socket_test.c create mode 100644 src/vnet/tcp/builtin_client.c create mode 100644 src/vnet/tcp/builtin_client.h create mode 100644 src/vnet/tcp/tcp_test.c (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index e3f534b1..07b0d2df 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "svm_fifo.h" +#include /** create an svm fifo, in the current heap. Fails vs blow up the process */ svm_fifo_t * @@ -362,18 +362,19 @@ svm_fifo_enqueue_nowait (svm_fifo_t * f, return svm_fifo_enqueue_internal (f, pid, max_bytes, copy_from_here); } -/** Enqueue a future segment. +/** + * Enqueue a future segment. + * * Two choices: either copies the entire segment, or copies nothing * Returns 0 of the entire segment was copied * Returns -1 if none of the segment was copied due to lack of space */ - static int -svm_fifo_enqueue_with_offset_internal2 (svm_fifo_t * f, - int pid, - u32 offset, - u32 required_bytes, - u8 * copy_from_here) +svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, + u8 * copy_from_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -424,14 +425,14 @@ svm_fifo_enqueue_with_offset (svm_fifo_t * f, u32 offset, u32 required_bytes, u8 * copy_from_here) { - return svm_fifo_enqueue_with_offset_internal2 + return svm_fifo_enqueue_with_offset_internal (f, pid, offset, required_bytes, copy_from_here); } static int -svm_fifo_dequeue_internal2 (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_here) +svm_fifo_dequeue_internal (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -484,7 +485,7 @@ int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, u8 * copy_here) { - return svm_fifo_dequeue_internal2 (f, pid, max_bytes, copy_here); + return svm_fifo_dequeue_internal (f, pid, max_bytes, copy_here); } int @@ -492,7 +493,7 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; - u32 cursize, nitems; + u32 cursize, nitems, real_head; if (PREDICT_FALSE (f->cursize == 0)) return -2; /* nothing in the fifo */ @@ -500,6 +501,8 @@ svm_fifo_peek (svm_fifo_t * f, int 
pid, u32 offset, u32 max_bytes, /* read cursize, which can only increase while we're working */ cursize = f->cursize; nitems = f->nitems; + real_head = f->head + offset; + real_head = real_head >= nitems ? real_head - nitems : real_head; /* Number of bytes we're going to copy */ total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; @@ -508,9 +511,9 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, { /* Number of bytes in first copy segment */ first_copy_bytes = - ((nitems - f->head + offset) < total_copy_bytes) ? - (nitems - f->head + offset) : total_copy_bytes; - clib_memcpy (copy_here, &f->data[f->head + offset], first_copy_bytes); + ((nitems - real_head) < total_copy_bytes) ? + (nitems - real_head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[real_head], first_copy_bytes); /* Number of bytes in second copy segment, if any */ second_copy_bytes = total_copy_bytes - first_copy_bytes; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 70624b74..39556173 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -46,9 +46,11 @@ typedef struct { pthread_mutex_t mutex; /* 8 bytes */ pthread_cond_t condvar; /* 8 bytes */ - u32 owner_pid; svm_lock_tag_t tag; - volatile u32 cursize; + + volatile u32 cursize; /**< current fifo size */ + volatile u8 has_event; /**< non-zero if deq event exists */ + u32 owner_pid; u32 nitems; /* Backpointers */ @@ -112,6 +114,28 @@ svm_fifo_has_ooo_data (svm_fifo_t * f) return f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX; } +/** + * Sets fifo event flag. + * + * @return 1 if flag was not set. + */ +always_inline u8 +svm_fifo_set_event (svm_fifo_t * f) +{ + /* Probably doesn't need to be atomic. Still, better avoid surprises */ + return __sync_lock_test_and_set (&f->has_event, 1) == 0; +} + +/** + * Unsets fifo event flag. + */ +always_inline void +svm_fifo_unset_event (svm_fifo_t * f) +{ + /* Probably doesn't need to be atomic. 
Still, better avoid surprises */ + __sync_lock_test_and_set (&f->has_event, 0); +} + svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 793fa7c8..ecb5653a 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -15,8 +15,8 @@ #ifndef __included_ssvm_fifo_segment_h__ #define __included_ssvm_fifo_segment_h__ -#include "svm_fifo.h" -#include "ssvm.h" +#include +#include typedef struct { diff --git a/src/uri.am b/src/uri.am index 09b5b15b..ad4d65d8 100644 --- a/src/uri.am +++ b/src/uri.am @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -noinst_PROGRAMS += uri_udp_test uri_tcp_test +noinst_PROGRAMS += uri_udp_test uri_tcp_test uri_socket_test uri_udp_test_SOURCES = uri/uri_udp_test.c uri_udp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ @@ -20,3 +20,6 @@ uri_udp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ uri_tcp_test_SOURCES = uri/uri_tcp_test.c uri_tcp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ libvppinfra.la -lpthread -lm -lrt + +uri_socket_test_SOURCES = uri/uri_socket_test.c +uri_socket_test_LDADD = libvppinfra.la -lpthread -lm -lrt diff --git a/src/uri/uri_socket_test.c b/src/uri/uri_socket_test.c new file mode 100644 index 00000000..9f049bda --- /dev/null +++ b/src/uri/uri_socket_test.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +int +main (int argc, char *argv[]) +{ + int sockfd, portno, n; + struct sockaddr_in serv_addr; + struct hostent *server; + u8 *rx_buffer = 0, *tx_buffer = 0; + u32 offset; + int iter, i; + if (0 && argc < 3) + { + fformat (stderr, "usage %s hostname port\n", argv[0]); + exit (0); + } + + portno = 1234; // atoi(argv[2]); + sockfd = socket (AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + clib_unix_error ("socket"); + exit (1); + } + server = gethostbyname ("6.0.1.1" /* argv[1] */ ); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + bzero ((char *) &serv_addr, sizeof (serv_addr)); + serv_addr.sin_family = AF_INET; + bcopy ((char *) server->h_addr, + (char *) &serv_addr.sin_addr.s_addr, server->h_length); + serv_addr.sin_port = htons (portno); + if (connect (sockfd, (const void *) &serv_addr, sizeof (serv_addr)) < 0) + { + clib_unix_warning ("connect"); + exit (1); + } + + vec_validate (rx_buffer, 1400); + vec_validate (tx_buffer, 1400); + + for (i = 0; i < vec_len (tx_buffer); i++) + tx_buffer[i] = (i + 1) % 0xff; + + /* + * Send one packet to warm up the RX pipeline + */ + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) + { + clib_unix_warning ("write"); + exit (1); + } + + for (iter = 0; iter < 100000; iter++) + { + if (iter < 99999) + { + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) + { + clib_unix_warning ("write"); + exit (1); + } + } + 
offset = 0; + + do + { + n = recv (sockfd, rx_buffer + offset, + vec_len (rx_buffer) - offset, 0 /* flags */ ); + if (n <= 0) /* error, or 0 when the peer closed the connection */ + { + clib_unix_warning ("read"); + exit (1); + } + offset += n; + } + while (offset < vec_len (rx_buffer)); + + for (i = 0; i < vec_len (rx_buffer); i++) + { + if (rx_buffer[i] != tx_buffer[i]) + { + clib_warning ("[%d] read 0x%x not 0x%x", + i, rx_buffer[i], tx_buffer[i]); + exit (1); + } + } + + } + close (sockfd); + return 0; +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 406a5f4e..e2834817 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -116,6 +116,7 @@ typedef struct pthread_t client_rx_thread_handle; u32 client_bytes_received; u8 test_return_packets; + u32 bytes_to_send; /* convenience */ svm_fifo_segment_main_t *segment_main; @@ -313,11 +314,16 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, rx_fifo = e->fifo; - bytes = e->enqueue_length; + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of new event */ + svm_fifo_unset_event (rx_fifo); + + /* Read the bytes */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), - utm->rx_buf); + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, + clib_min (vec_len (utm->rx_buf), + bytes), utm->rx_buf); if (n_read > 0) { bytes -= n_read; @@ -333,9 +339,17 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, } utm->client_bytes_received += n_read; } + else + { + if (n_read == -2) + { + clib_warning ("weird!"); + break; + } + } } - while (n_read < 0 || bytes > 0); + while (bytes > 0); } void @@ -479,47 +493,41 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) } } -void -client_send_data (uri_tcp_test_main_t * utm) +static void +send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, + u32 bytes) { u8 *test_data = utm->connect_test_data; u64 bytes_sent 
= 0; - int rv; - int mypid = getpid (); - session_t *session; - svm_fifo_t *tx_fifo; - int buffer_offset, bytes_to_send = 0; + int test_buf_offset = 0; + u32 bytes_to_snd; + u32 queue_max_chunk = 64 << 10, actual_write; session_fifo_event_t evt; static int serial_number = 0; - int i; - u32 max_chunk = 64 << 10, write; - - session = pool_elt_at_index (utm->sessions, utm->connected_session_index); - tx_fifo = session->server_tx_fifo; + int rv; - vec_validate (utm->rx_buf, vec_len (test_data) - 1); + bytes_to_snd = (bytes == 0) ? vec_len (test_data) : bytes; + if (bytes_to_snd > vec_len (test_data)) + bytes_to_snd = vec_len (test_data); - for (i = 0; i < 1; i++) + while (bytes_to_snd > 0) { - bytes_to_send = vec_len (test_data); - buffer_offset = 0; - while (bytes_to_send > 0) + actual_write = + bytes_to_snd > queue_max_chunk ? queue_max_chunk : bytes_to_snd; + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, actual_write, + test_data + test_buf_offset); + + if (rv > 0) { - write = bytes_to_send > max_chunk ? 
max_chunk : bytes_to_send; - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, - test_data + buffer_offset); + bytes_to_snd -= rv; + test_buf_offset += rv; + bytes_sent += rv; - if (rv > 0) + if (svm_fifo_set_event (tx_fifo)) { - bytes_to_send -= rv; - buffer_offset += rv; - bytes_sent += rv; - /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = rv; evt.event_id = serial_number++; unix_shared_memory_queue_add (utm->vpp_event_queue, @@ -528,13 +536,40 @@ client_send_data (uri_tcp_test_main_t * utm) } } } +} + +void +client_send_data (uri_tcp_test_main_t * utm) +{ + u8 *test_data = utm->connect_test_data; + int mypid = getpid (); + session_t *session; + svm_fifo_t *tx_fifo; + u32 n_iterations, leftover; + int i; + + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + tx_fifo = session->server_tx_fifo; + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + n_iterations = utm->bytes_to_send / vec_len (test_data); + + for (i = 0; i < n_iterations; i++) + { + send_test_chunk (utm, tx_fifo, mypid, 0); + } + + leftover = utm->bytes_to_send % vec_len (test_data); + if (leftover) + send_test_chunk (utm, tx_fifo, mypid, leftover); if (utm->test_return_packets) { f64 timeout = clib_time_now (&utm->clib_time) + 2; /* Wait for the outstanding packets */ - while (utm->client_bytes_received < vec_len (test_data)) + while (utm->client_bytes_received < + vec_len (test_data) * n_iterations + leftover) { if (clib_time_now (&utm->clib_time) > timeout) { @@ -542,9 +577,8 @@ client_send_data (uri_tcp_test_main_t * utm) break; } } - - utm->time_to_stop = 1; } + utm->time_to_stop = 1; } void @@ -599,6 +633,11 @@ client_test (uri_tcp_test_main_t * utm) /* Disconnect */ client_disconnect (utm); + + if (wait_for_state_change (utm, STATE_START)) + { + return; + } } static void @@ -714,7 +753,6 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { svm_fifo_t 
*rx_fifo, *tx_fifo; int n_read; - session_fifo_event_t evt; unix_shared_memory_queue_t *q; int rv, bytes; @@ -722,34 +760,46 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, rx_fifo = e->fifo; tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; - bytes = e->enqueue_length; + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (rx_fifo); + + if (bytes == 0) + return; + + /* Read the bytes */ do { n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), utm->rx_buf); + if (n_read > 0) + bytes -= n_read; + + if (utm->drop_packets) + continue; /* Reflect if a non-drop session */ - if (!utm->drop_packets && n_read > 0) + if (n_read > 0) { do { rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); } - while (rv == -2 && !utm->time_to_stop); - - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = n_read; - evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); - } + while (rv <= 0 && !utm->time_to_stop); - if (n_read > 0) - bytes -= n_read; + /* If event wasn't set, add one */ + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = e->event_id; + + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + } } while ((n_read < 0 || bytes > 0) && !utm->time_to_stop); } @@ -852,7 +902,10 @@ static void vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * mp) { + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + clib_warning ("retval %d", ntohl (mp->retval)); + utm->state = STATE_START; } #define foreach_uri_msg \ @@ -888,6 +941,7 @@ main (int argc, char **argv) u8 *heap, *uri = 0; u8 *bind_uri = (u8 *) 
"tcp://0.0.0.0/1234"; u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u32 bytes_to_send = 64 << 10, mbytes; u32 tmp; mheap_t *h; session_t *session; @@ -934,6 +988,10 @@ main (int argc, char **argv) drop_packets = 1; else if (unformat (a, "test")) test_return_packets = 1; + else if (unformat (a, "mbytes %d", &mbytes)) + { + bytes_to_send = mbytes << 20; + } else { fformat (stderr, "%s: usage [master|slave]\n"); @@ -956,6 +1014,7 @@ main (int argc, char **argv) utm->segment_main = &svm_fifo_segment_main; utm->drop_packets = drop_packets; utm->test_return_packets = test_return_packets; + utm->bytes_to_send = bytes_to_send; setup_signal_handlers (); uri_api_hookup (utm); diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 54625d64..e6c239c1 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -742,17 +742,20 @@ server_handle_fifo_event_rx (uri_udp_test_main_t * utm, /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = nbytes; evt.event_id = e->event_id; - q = utm->vpp_event_queue; - unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + + if (svm_fifo_set_event (tx_fifo)) + { + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } } void server_handle_event_queue (uri_udp_test_main_t * utm) { - session_fifo_event_t _e, *e = &_e;; + session_fifo_event_t _e, *e = &_e; while (1) { diff --git a/src/vnet.am b/src/vnet.am index 3e73de8f..9c55e336 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -462,7 +462,9 @@ libvnet_la_SOURCES += \ vnet/tcp/tcp_output.c \ vnet/tcp/tcp_input.c \ vnet/tcp/tcp_newreno.c \ + vnet/tcp/builtin_client.c \ vnet/tcp/builtin_server.c \ + vnet/tcp/tcp_test.c \ vnet/tcp/tcp.c nobase_include_HEADERS += \ diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index a60a8b8b..480828f7 100644 --- a/src/vnet/session/application.h 
+++ b/src/vnet/session/application.h @@ -45,8 +45,7 @@ typedef struct _stream_session_cb_vft void (*session_reset_callback) (stream_session_t * s); /* Direct RX callback, for built-in servers */ - int (*builtin_server_rx_callback) (stream_session_t * session, - session_fifo_event_t * ep); + int (*builtin_server_rx_callback) (stream_session_t * session); /* Redirect connection to local server */ int (*redirect_connect_callback) (u32 api_client_index, void *mp); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 822afebd..8681105c 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -13,21 +13,14 @@ * limitations under the License. */ +#include #include #include -#include -#include - #include - -#include -#include #include -#include - -#include -#include +#include #include +#include vlib_node_registration_t session_queue_node; @@ -52,8 +45,8 @@ format_session_queue_trace (u8 * s, va_list * args) vlib_node_registration_t session_queue_node; -#define foreach_session_queue_error \ -_(TX, "Packets transmitted") \ +#define foreach_session_queue_error \ +_(TX, "Packets transmitted") \ _(TIMER, "Timer events") typedef enum @@ -91,10 +84,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, transport_proto_vft_t *transport_vft; u32 next_index, next0, *to_next, n_left_to_next, bi0; vlib_buffer_t *b0; - u32 rx_offset; + u32 rx_offset = 0, max_dequeue0; u16 snd_mss0; u8 *data0; - int i; + int i, n_bytes_read; next_index = next0 = session_type_to_next[s0->session_type]; @@ -106,24 +99,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, snd_mss0 = transport_vft->send_mss (tc0); /* Can't make any progress */ - if (snd_space0 == 0 || svm_fifo_max_dequeue (s0->server_tx_fifo) == 0 - || snd_mss0 == 0) + if (snd_space0 == 0 || snd_mss0 == 0) { vec_add1 (smm->evts_partially_read[thread_index], *e0); return 0; } - ASSERT (e0->enqueue_length > 0); - - /* Ensure we're not writing more than 
transport window allows */ - max_len_to_snd0 = clib_min (e0->enqueue_length, snd_space0); - if (peek_data) { /* Offset in rx fifo from where to peek data */ rx_offset = transport_vft->tx_fifo_offset (tc0); } + /* Check how much we can pull. If buffering, subtract the offset */ + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; + + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); + + /* Nothing to read return */ + if (max_dequeue0 == 0) + { + return 0; + } + + /* Ensure we're not writing more than transport window allows */ + max_len_to_snd0 = clib_min (max_dequeue0, snd_space0); + /* TODO check if transport is willing to send len_to_snd0 * bytes (Nagle) */ @@ -147,13 +149,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * XXX 0.9 because when debugging we might not get a full frame */ if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) { - /* Keep track of how much we've dequeued and exit */ - if (left_to_snd0 != max_len_to_snd0) + if (svm_fifo_set_event (s0->server_tx_fifo)) { - e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; vec_add1 (smm->evts_partially_read[thread_index], *e0); } - return -1; } @@ -198,9 +197,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, len_to_deq0 = (left_to_snd0 < snd_mss0) ? 
left_to_snd0 : snd_mss0; /* *INDENT-OFF* */ - SESSION_EVT_DBG(s0, SESSION_EVT_DEQ, ({ + SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ ed->data[0] = e0->event_id; - ed->data[1] = e0->enqueue_length; + ed->data[1] = max_dequeue0; ed->data[2] = len_to_deq0; ed->data[3] = left_to_snd0; })); @@ -214,29 +213,30 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * 2) buffer chains */ if (peek_data) { - int n_bytes_read; n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, s0->pid, rx_offset, len_to_deq0, data0); - if (n_bytes_read < 0) + if (n_bytes_read <= 0) goto dequeue_fail; /* Keep track of progress locally, transport is also supposed to - * increment it independently when pushing header */ + * increment it independently when pushing the header */ rx_offset += n_bytes_read; } else { - if (svm_fifo_dequeue_nowait (s0->server_tx_fifo, s0->pid, - len_to_deq0, data0) < 0) + n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, + s0->pid, len_to_deq0, + data0); + if (n_bytes_read <= 0) goto dequeue_fail; } - b0->current_length = len_to_deq0; + b0->current_length = n_bytes_read; /* Ask transport to push header */ transport_vft->push_header (tc0, b0); - left_to_snd0 -= len_to_deq0; + left_to_snd0 -= n_bytes_read; *n_tx_packets = *n_tx_packets + 1; vlib_validate_buffer_enqueue_x1 (vm, node, next_index, @@ -246,25 +246,31 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - /* If we couldn't dequeue all bytes store progress */ - if (max_len_to_snd0 < e0->enqueue_length) + /* If we couldn't dequeue all bytes mark as partially read */ + if (max_len_to_snd0 < max_dequeue0) { - e0->enqueue_length -= max_len_to_snd0; - vec_add1 (smm->evts_partially_read[thread_index], *e0); + /* If we don't already have new event */ + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } } return 0; dequeue_fail: - /* Can't 
read from fifo. Store event rx progress, save as partially read, - * return buff to free list and return */ - e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; - vec_add1 (smm->evts_partially_read[thread_index], *e0); + /* + * Can't read from fifo. If we don't already have an event, save as partially + * read, return buff to free list and return + */ + clib_warning ("dequeue fail"); - to_next -= 1; - n_left_to_next += 1; + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); _vec_len (smm->tx_buffers[thread_index]) += 1; - clib_warning ("dequeue fail"); return 0; } @@ -298,6 +304,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, session_fifo_event_t *my_fifo_events, *e; u32 n_to_dequeue, n_events; unix_shared_memory_queue_t *q; + application_t *app; int n_tx_packets = 0; u32 my_thread_index = vm->cpu_index; int i, rv; @@ -321,13 +328,18 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (n_to_dequeue == 0 && vec_len (my_fifo_events) == 0) return 0; + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 0); + /* * If we didn't manage to process previous events try going * over them again without dequeuing new ones. 
*/ /* XXX: Block senders to sessions that can't keep up */ if (vec_len (my_fifo_events) >= 100) - goto skip_dequeue; + { + clib_warning ("too many fifo events unsolved"); + goto skip_dequeue; + } /* See you in the next life, don't be late */ if (pthread_mutex_trylock (&q->mutex)) @@ -352,19 +364,17 @@ skip_dequeue: { svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ stream_session_t *s0; - u32 server_session_index0, server_thread_index0; + u32 session_index0; session_fifo_event_t *e0; e0 = &my_fifo_events[i]; f0 = e0->fifo; - server_session_index0 = f0->server_session_index; - server_thread_index0 = f0->server_thread_index; + session_index0 = f0->server_session_index; /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (server_thread_index0 == my_thread_index); + ASSERT (f0->server_thread_index == my_thread_index); - s0 = stream_session_get_if_valid (server_session_index0, - my_thread_index); + s0 = stream_session_get_if_valid (session_index0, my_thread_index); if (CLIB_DEBUG && !s0) { @@ -385,11 +395,20 @@ skip_dequeue: rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, my_thread_index, &n_tx_packets); + /* Out of buffers */ if (rv < 0) goto done; break; - + case FIFO_EVENT_SERVER_EXIT: + stream_session_disconnect (s0); + break; + case FIFO_EVENT_BUILTIN_RX: + svm_fifo_unset_event (s0->server_rx_fifo); + /* Get session's server */ + app = application_get (s0->app_index); + app->cb_fns.builtin_server_rx_callback (s0); + break; default: clib_warning ("unhandled event type %d", e0->event_type); } @@ -418,6 +437,8 @@ done: vlib_node_increment_counter (vm, session_queue_node.index, SESSION_QUEUE_ERROR_TX, n_tx_packets); + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 1); + return n_tx_packets; } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 06e2a09a..f10918aa 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -804,30 +804,36 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) 
/* Get session's server */ app = application_get (s->app_index); - /* Fabricate event */ - evt.fifo = s->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; - evt.event_id = serial_number++; - evt.enqueue_length = svm_fifo_max_dequeue (s->server_rx_fifo); - /* Built-in server? Hand event to the callback... */ if (app->cb_fns.builtin_server_rx_callback) - return app->cb_fns.builtin_server_rx_callback (s, &evt); - - /* Add event to server's event queue */ - q = app->event_queue; + return app->cb_fns.builtin_server_rx_callback (s); - /* Based on request block (or not) for lack of space */ - if (block || PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - else - return -1; + /* If no event, send one */ + if (svm_fifo_set_event (s->server_rx_fifo)) + { + /* Fabricate event */ + evt.fifo = s->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + + /* Add event to server's event queue */ + q = app->event_queue; + + /* Based on request block (or not) for lack of space */ + if (block || PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + { + clib_warning ("fifo full"); + return -1; + } + } /* *INDENT-OFF* */ - SESSION_EVT_DBG(s, SESSION_EVT_ENQ, ({ + SESSION_EVT_DBG(SESSION_EVT_ENQ, s, ({ ed->data[0] = evt.event_id; - ed->data[1] = evt.enqueue_length; + ed->data[1] = svm_fifo_max_dequeue (s->server_rx_fifo); })); /* *INDENT-ON* */ @@ -1192,8 +1198,29 @@ stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, void stream_session_disconnect (stream_session_t * s) { +// session_fifo_event_t evt; + s->session_state = SESSION_STATE_CLOSED; + /* RPC to vpp evt queue in the right thread */ + tp_vfts[s->session_type].close (s->connection_index, s->thread_index); + +// { +// /* Fabricate event */ +// evt.fifo = s->server_rx_fifo; +// 
evt.event_type = FIFO_EVENT_SERVER_RX; +// evt.event_id = serial_number++; +// +// /* Based on request block (or not) for lack of space */ +// if (PREDICT_TRUE(q->cursize < q->maxsize)) +// unix_shared_memory_queue_add (app->event_queue, (u8 *) &evt, +// 0 /* do wait for mutex */); +// else +// { +// clib_warning("fifo full"); +// return -1; +// } +// } } /** diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 96c00d87..a39bc06f 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -33,6 +33,7 @@ typedef enum FIFO_EVENT_SERVER_TX, FIFO_EVENT_TIMEOUT, FIFO_EVENT_SERVER_EXIT, + FIFO_EVENT_BUILTIN_RX } fifo_event_type_t; #define foreach_session_input_error \ @@ -91,14 +92,13 @@ typedef enum SESSION_STATE_N_STATES, } stream_session_state_t; -typedef CLIB_PACKED (struct - { - svm_fifo_t * fifo; - u8 event_type; - /* $$$$ for event logging */ - u16 event_id; - u32 enqueue_length; - }) session_fifo_event_t; +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct { + svm_fifo_t * fifo; + u8 event_type; + u16 event_id; +}) session_fifo_event_t; +/* *INDENT-ON* */ typedef struct _stream_session_t { @@ -333,7 +333,7 @@ stream_session_get_index (stream_session_t * s) } always_inline u32 -stream_session_max_enqueue (transport_connection_t * tc) +stream_session_max_rx_enqueue (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); return svm_fifo_max_enqueue (s->server_rx_fifo); @@ -346,7 +346,6 @@ stream_session_fifo_size (transport_connection_t * tc) return s->server_rx_fifo->nitems; } - int stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, u8 queue_event); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index b029ee65..38762afc 100644 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -107,7 +107,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, { if (once_per_pool) { - str = format 
(str, "%-40s%-20s%-20s%-15s", + str = format (str, "%-50s%-20s%-20s%-15s", "Connection", "Rx fifo", "Tx fifo", "Session Index"); vlib_cli_output (vm, "%v", str); diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h index 858f12e0..80a97cd5 100644 --- a/src/vnet/session/session_debug.h +++ b/src/vnet/session/session_debug.h @@ -21,7 +21,8 @@ #define foreach_session_dbg_evt \ _(ENQ, "enqueue") \ - _(DEQ, "dequeue") + _(DEQ, "dequeue") \ + _(DEQ_NODE, "dequeue") typedef enum _session_evt_dbg { @@ -30,7 +31,10 @@ typedef enum _session_evt_dbg #undef _ } session_evt_dbg_e; -#if TRANSPORT_DEBUG +#define SESSION_DBG (0) +#define SESSION_DEQ_NODE_EVTS (0) + +#if TRANSPORT_DEBUG && SESSION_DBG #define DEC_SESSION_ETD(_s, _e, _size) \ struct \ @@ -44,6 +48,12 @@ typedef enum _session_evt_dbg ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ _e, _tc->elog_track) +#define DEC_SESSION_ED(_e, _size) \ + struct \ + { \ + u32 data[_size]; \ + } * ed; \ + ed = ELOG_DATA (&vlib_global_main.elog_main, _e) #define SESSION_EVT_DEQ_HANDLER(_s, _body) \ { \ @@ -67,13 +77,33 @@ typedef enum _session_evt_dbg do { _body; } while (0); \ } +#if SESSION_DEQ_NODE_EVTS +#define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "deq-node: %s", \ + .format_args = "t4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "start", \ + "end", \ + }, \ + }; \ + DEC_SESSION_ED(_e, 1); \ + ed->data[0] = _node_evt; \ +} +#else +#define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) -#define SESSION_EVT_DBG(_s, _evt, _body) CC(_evt, _HANDLER)(_s, _body) +#define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else -#define SESSION_EVT_DBG(_s, _evt, _body) +#define SESSION_EVT_DBG(_evt, _args...) 
#endif #endif /* SRC_VNET_SESSION_SESSION_DEBUG_H_ */ diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 421121d2..2f912cbc 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -38,7 +38,7 @@ typedef struct _transport_connection u32 thread_index; /**< Worker-thread index */ #if TRANSPORT_DEBUG - elog_track_t elog_track; /**< Debug purposes */ + elog_track_t elog_track; /**< Event logging */ #endif /** Macros for 'derived classes' where base is named "connection" */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c new file mode 100644 index 00000000..a6eeb775 --- /dev/null +++ b/src/vnet/tcp/builtin_client.c @@ -0,0 +1,411 @@ +/* + * builtin_client.c - vpp built-in tcp client/connect code + * + * Copyright (c) 2017 by Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +/* define message IDs */ +#include + +/* define message structures */ +#define vl_typedefs +#include +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include +#undef vl_printfun + +static void +send_test_chunk (tclient_main_t * tm, session_t * s) +{ + u8 *test_data = tm->connect_test_data; + int test_buf_offset = 0; + u32 bytes_this_chunk; + session_fifo_event_t evt; + static int serial_number = 0; + int rv; + + while (s->bytes_to_send > 0) + { + bytes_this_chunk = vec_len (test_data) < s->bytes_to_send + ? vec_len (test_data) : s->bytes_to_send; + + rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , + bytes_this_chunk, + test_data + test_buf_offset); + + if (rv > 0) + { + s->bytes_to_send -= rv; + test_buf_offset += rv; + + if (svm_fifo_set_event (s->server_tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = s->server_tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + } + } +} + +static void +receive_test_chunk (tclient_main_t * tm, session_t * s) +{ + svm_fifo_t *rx_fifo = s->server_rx_fifo; + int n_read, bytes, i; + + bytes = svm_fifo_max_dequeue (rx_fifo); + /* Allow enqueuing of new event */ + svm_fifo_unset_event (rx_fifo); + + /* Read the bytes */ + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), + tm->rx_buf); + if (n_read > 0) + { + bytes -= n_read; + for (i = 0; i < n_read; i++) + { + if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff)) + { + clib_warning ("read %d error at byte %lld, 0x%x not 0x%x", + n_read, s->bytes_received + i, + tm->rx_buf[i], + ((s->bytes_received + i) & 0xff)); + } + } + s->bytes_to_receive -= n_read; + s->bytes_received += n_read; + } + + } + while (n_read < 0 || bytes > 0); +} + +static void * +tclient_thread_fn (void *arg) +{ + tclient_main_t *tm = &tclient_main; + vl_api_disconnect_session_t *dmp; + session_t *sp; + struct timespec ts, tsrem; + int i; + int try_tx, try_rx; + u32 *session_indices = 0; + + /* stats thread 
wants no signals. */ + { + sigset_t s; + sigfillset (&s); + pthread_sigmask (SIG_SETMASK, &s, 0); + } + + while (1) + { + /* Wait until we're told to get busy */ + while (tm->run_test == 0 + || (tm->ready_connections != tm->expected_connections)) + { + ts.tv_sec = 0; + ts.tv_nsec = 100000000; + while (nanosleep (&ts, &tsrem) < 0) + ts = tsrem; + } + tm->run_test = 0; + + clib_warning ("Run %d iterations", tm->n_iterations); + + for (i = 0; i < tm->n_iterations; i++) + { + session_t *sp; + + do + { + try_tx = try_rx = 0; + + /* *INDENT-OFF* */ + pool_foreach (sp, tm->sessions, ({ + if (sp->bytes_to_send > 0) + { + send_test_chunk (tm, sp); + try_tx = 1; + } + })); + pool_foreach (sp, tm->sessions, ({ + if (sp->bytes_to_receive > 0) + { + receive_test_chunk (tm, sp); + try_rx = 1; + } + })); + /* *INDENT-ON* */ + + } + while (try_tx || try_rx); + } + clib_warning ("Done %d iterations", tm->n_iterations); + + /* Disconnect sessions... */ + vec_reset_length (session_indices); + pool_foreach (sp, tm->sessions, ( + { + vec_add1 (session_indices, + sp - tm->sessions); + } + )); + + for (i = 0; i < vec_len (session_indices); i++) + { + sp = pool_elt_at_index (tm->sessions, session_indices[i]); + dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = tm->my_client_index; + dmp->session_index = sp->vpp_session_index; + dmp->session_thread_index = sp->vpp_session_thread; + vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + pool_put (tm->sessions, sp); + } + } + /* NOTREACHED */ + return 0; +} + +/* So we don't get "no handler for... 
" msgs */ +static void +vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) +{ + tclient_main_t *tm = &tclient_main; + + tm->my_client_index = mp->index; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + tclient_main_t *tm = &tclient_main; + session_t *session; + u32 session_index; + u64 key; + i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ; + + if (retval < 0) + { + clib_warning ("connection failed: retval %d", retval); + return; + } + + tm->our_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + tm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* + * Setup session + */ + pool_get (tm->sessions, session); + memset (session, 0, sizeof (*session)); + session_index = session - tm->sessions; + session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + + session->server_rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + session->server_rx_fifo->client_session_index = session_index; + session->server_tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + session->server_tx_fifo->client_session_index = session_index; + + session->vpp_session_index = mp->session_index; + session->vpp_session_thread = mp->session_thread_index; + + /* Add it to the session lookup table */ + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + hash_set (tm->session_index_by_vpp_handles, key, session_index); + + tm->ready_connections++; +} + +static void +create_api_loopback (tclient_main_t * tm) +{ + vl_api_memclnt_create_t _m, *mp = &_m; + extern void vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t *); + api_main_t *am = &api_main; + vl_shmem_hdr_t *shmem_hdr; + + /* + * Create a "loopback" API client connection + * Don't do things like this unless you know what you're doing... 
+ */ + + shmem_hdr = am->shmem_hdr; + tm->vl_input_queue = shmem_hdr->vl_input_queue; + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; + mp->context = 0xFEEDFACE; + mp->input_queue = (u64) tm->vl_input_queue; + strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); + + vl_api_memclnt_create_t_handler (mp); +} + +#define foreach_tclient_static_api_msg \ +_(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \ +_(CONNECT_URI_REPLY, connect_uri_reply) + +static clib_error_t * +tclient_api_hookup (vlib_main_t * vm) +{ + tclient_main_t *tm = &tclient_main; + vl_msg_api_msg_config_t _c, *c = &_c; + int i; + + /* Init test data */ + vec_validate (tm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (tm->connect_test_data); i++) + tm->connect_test_data[i] = i & 0xff; + + tm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1); + + /* Hook up client-side static APIs to our handlers */ +#define _(N,n) do { \ + c->id = VL_API_##N; \ + c->name = #n; \ + c->handler = vl_api_##n##_t_handler; \ + c->cleanup = vl_noop_handler; \ + c->endian = vl_api_##n##_t_endian; \ + c->print = vl_api_##n##_t_print; \ + c->size = sizeof(vl_api_##n##_t); \ + c->traced = 1; /* trace, so these msgs print */ \ + c->replay = 0; /* don't replay client create/delete msgs */ \ + c->message_bounce = 0; /* don't bounce this message */ \ + vl_msg_api_config(c);} while (0); + + foreach_tclient_static_api_msg; +#undef _ + + return 0; +} + +VLIB_API_INIT_FUNCTION (tclient_api_hookup); + +static clib_error_t * +test_tcp_clients_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u8 *uri; + tclient_main_t *tm = &tclient_main; + int i; + u32 n_clients = 1; + + tm->bytes_to_send = 8192; + tm->n_iterations = 1; + vec_free (tm->connect_uri); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + 
if (unformat (input, "nclients %d", &n_clients)) + ; + else if (unformat (input, "iterations %d", &tm->n_iterations)) + ; + else if (unformat (input, "bytes %d", &tm->bytes_to_send)) + ; + else if (unformat (input, "uri %s", &tm->connect_uri)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + tm->ready_connections = 0; + tm->expected_connections = n_clients; + uri = connect_uri; + if (tm->connect_uri) + uri = tm->connect_uri; + + create_api_loopback (tm); + + /* Start a transmit thread */ + if (tm->client_thread_handle == 0) + { + int rv = pthread_create (&tm->client_thread_handle, + NULL /*attr */ , tclient_thread_fn, 0); + if (rv) + { + tm->client_thread_handle = 0; + return clib_error_return (0, "pthread_create returned %d", rv); + } + } + + /* Fire off connect requests, in something approaching a normal manner */ + for (i = 0; i < n_clients; i++) + { + vl_api_connect_uri_t *cmp; + cmp = vl_msg_api_alloc_as_if_client (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = tm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, uri, strlen ((char *) uri) + 1); + vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & cmp); + } + + tm->run_test = 1; + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_clients_command, static) = +{ + .path = "test tcp clients", + .short_help = "test tcp clients", + .function = test_tcp_clients_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h new file mode 100644 index 00000000..64030302 --- /dev/null +++ b/src/vnet/tcp/builtin_client.h @@ -0,0 +1,131 @@ + +/* + * tclient.h - skeleton vpp engine plug-in header file + * + * Copyright (c) + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_tclient_h__ +#define __included_tclient_h__ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +typedef struct +{ + u32 bytes_to_send; + u32 bytes_sent; + u32 bytes_to_receive; + u32 bytes_received; + + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + u32 vpp_session_index; + u32 vpp_session_thread; +} session_t; + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* URI for slave's connect */ + u8 *connect_uri; + + u32 connected_session_index; + + int i_am_master; + + /* drop all packets */ + int drop_packets; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + pid_t my_pid; + + /* For deadman timers */ + clib_time_t clib_time; + + /* Connection counts */ + u32 expected_connections; + volatile u32 ready_connections; + + /* Signal variables */ + volatile int run_test; + + /* Number of iterations */ + int n_iterations; + + /* Bytes to send */ + u32 bytes_to_send; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO 
-> "Foo" hash table */ + uword *error_string_by_error_number; + + u8 *connect_test_data; + pthread_t client_thread_handle; + u32 client_bytes_received; + u8 test_return_packets; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; +} tclient_main_t; + +tclient_main_t tclient_main; + +vlib_node_registration_t tclient_node; + +#endif /* __included_tclient_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index dd6759c5..efd26e91 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -22,6 +22,7 @@ typedef struct { u8 *rx_buf; unix_shared_memory_queue_t **vpp_queue; + u32 byte_index; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -37,6 +38,7 @@ builtin_session_accept_callback (stream_session_t * s) bsm->vpp_queue[s->thread_index] = session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; + bsm->byte_index = 0; return 0; } @@ -80,57 +82,94 @@ builtin_redirect_connect_callback (u32 client_index, void *mp) return -1; } +void +test_bytes (builtin_server_main_t * bsm, int actual_transfer) +{ + int i; + + for (i = 0; i < actual_transfer; i++) + { + if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) + { + clib_warning ("at %d expected %d got %d", bsm->byte_index + i, + (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); + } + } + bsm->byte_index += actual_transfer; +} + int -builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * e) +builtin_server_rx_callback (stream_session_t * s) { - int n_written, bytes, total_copy_bytes; - int n_read; - svm_fifo_t *tx_fifo; + u32 n_written, max_dequeue, max_enqueue, max_transfer; + int actual_transfer; + svm_fifo_t *tx_fifo, *rx_fifo; builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - bytes = 
e->enqueue_length; - if (PREDICT_FALSE (bytes <= 0)) + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); + max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); + + if (PREDICT_FALSE (max_dequeue == 0)) { - clib_warning ("bizarre rx callback: bytes %d", bytes); return 0; } tx_fifo = s->server_tx_fifo; + rx_fifo = s->server_rx_fifo; /* Number of bytes we're going to copy */ - total_copy_bytes = (bytes < (tx_fifo->nitems - tx_fifo->cursize)) ? bytes : - tx_fifo->nitems - tx_fifo->cursize; + max_transfer = (max_dequeue < max_enqueue) ? max_dequeue : max_enqueue; - if (PREDICT_FALSE (total_copy_bytes <= 0)) + /* No space in tx fifo */ + if (PREDICT_FALSE (max_transfer == 0)) { - clib_warning ("no space in tx fifo, event had %d bytes", bytes); + /* XXX timeout for session that are stuck */ + + /* Program self-tap to retry */ + if (svm_fifo_set_event (rx_fifo)) + { + evt.fifo = rx_fifo; + evt.event_type = FIFO_EVENT_BUILTIN_RX; + evt.event_id = 0; + unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + return 0; } - vec_validate (bsm->rx_buf, total_copy_bytes - 1); - _vec_len (bsm->rx_buf) = total_copy_bytes; + svm_fifo_unset_event (rx_fifo); + + vec_validate (bsm->rx_buf, max_transfer - 1); + _vec_len (bsm->rx_buf) = max_transfer; - n_read = svm_fifo_dequeue_nowait (s->server_rx_fifo, 0, total_copy_bytes, - bsm->rx_buf); - ASSERT (n_read == total_copy_bytes); + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, max_transfer, + bsm->rx_buf); + ASSERT (actual_transfer == max_transfer); + +// test_bytes (bsm, actual_transfer); /* * Echo back */ - n_written = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, bsm->rx_buf); - ASSERT (n_written == total_copy_bytes); + n_written = + svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf); + ASSERT (n_written == max_transfer); - /* Fabricate TX event, send to vpp */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - evt.enqueue_length = 
total_copy_bytes; - evt.event_id = serial_number++; + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], (u8 *) & evt, - 0 /* do wait for mutex */ ); + unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, 0 /* do wait for mutex */ ); + } return 0; } @@ -164,7 +203,7 @@ server_create (vlib_main_t * vm) a->api_client_index = ~0; a->session_cb_vft = &builtin_session_cb_vft; a->options = options; - a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 10; + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; a->segment_name = segment_name; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 0d2e6d0e..c3df5bc1 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -328,7 +328,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) { tcp_connection_timers_init (tc); tcp_set_snd_mss (tc); - tc->sack_sb.head = TCP_INVALID_SACK_HOLE_INDEX; + scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); } @@ -558,17 +558,48 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +/** + * Compute tx window session is allowed to fill. 
+ */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { + u32 snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return tcp_available_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked bytes + * then we can still send */ + if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + && (tc->rcv_dupacks == 0 + || tc->sack_sb.last_sacked_bytes))) + { + snd_space = tcp_available_snd_space (tc); + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + return snd_space; + } + + /* If in fast recovery, send 1 SMSS if wnd allows */ + if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) + && tcp_fastrecovery_sent_1_smss (tc)) + { + tcp_fastrecovery_1_smss_on (tc); + return tc->snd_mss; + } + + return 0; } u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + + ASSERT (seq_geq (tc->snd_nxt, tc->snd_una)); + + /* This still works if fast retransmit is on */ return (tc->snd_nxt - tc->snd_una); } @@ -762,7 +793,7 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->timer_wheels, num_threads - 1); tcp_initialize_timer_wheels (tm); - vec_validate (tm->delack_connections, num_threads - 1); +// vec_validate (tm->delack_connections, num_threads - 1); /* Initialize clocks per tick for TCP timestamp. Used to compute * monotonically increasing timestamps. 
*/ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 082ab1d8..b4286bc4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -30,9 +30,10 @@ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ #define TCP_MAX_OPTION_SPACE 40 -#define TCP_DUPACK_THRESHOLD 3 -#define TCP_MAX_RX_FIFO_SIZE 2 << 20 -#define TCP_IW_N_SEGMENTS 10 +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_IW_N_SEGMENTS 10 +#define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -102,13 +103,12 @@ void tcp_update_time (f64 now, u32 thread_index); /** TCP connection flags */ #define foreach_tcp_connection_flag \ - _(DELACK, "Delay ACK") \ _(SNDACK, "Send ACK") \ - _(BURSTACK, "Burst ACK set") \ _(FINSNT, "FIN sent") \ _(SENT_RCV_WND0, "Sent 0 receive window") \ _(RECOVERY, "Recovery on") \ - _(FAST_RECOVERY, "Fast Recovery on") + _(FAST_RECOVERY, "Fast Recovery on") \ + _(FR_1_SMSS, "Sent 1 SMSS") typedef enum _tcp_connection_flag_bits { @@ -160,8 +160,12 @@ typedef struct _sack_scoreboard_hole typedef struct _sack_scoreboard { sack_scoreboard_hole_t *holes; /**< Pool of holes */ - u32 head; /**< Index to first entry */ + u32 head; /**< Index of first entry */ + u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ + u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 snd_una_adv; /**< Bytes to add to snd_una */ + u32 max_byte_sacked; /**< Highest byte acked */ } sack_scoreboard_t; typedef enum _tcp_cc_algorithm_type @@ -214,7 +218,7 @@ typedef struct _tcp_connection sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? 
*/ sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ - u8 rcv_dupacks; /**< Number of DUPACKs received */ + u16 rcv_dupacks; /**< Number of DUPACKs received */ u8 snt_dupacks; /**< Number of DUPACKs sent in a burst */ /* Congestion control */ @@ -224,6 +228,7 @@ typedef struct _tcp_connection u32 bytes_acked; /**< Bytes acknowledged by current segment */ u32 rtx_bytes; /**< Retransmitted bytes */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ + u32 snd_congestion; /**< snd_una_max when congestion is detected */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ /* RTT and RTO */ @@ -250,8 +255,10 @@ struct _tcp_cc_algorithm #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) #define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) -#define tcp_recovery_off(tc) ((tc)->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) +#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) +#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) typedef enum { @@ -293,8 +300,8 @@ typedef struct _tcp_main /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; - /* Convenience per worker-thread vector of connections to DELACK */ - u32 **delack_connections; +// /* Convenience per worker-thread vector of connections to DELACK */ +// u32 **delack_connections; /* Pool of half-open connections on which we've sent a SYN */ tcp_connection_t *half_open_connections; @@ -397,8 +404,16 @@ tcp_end_seq (tcp_header_t * th, u32 len) always_inline u32 tcp_flight_size (const tcp_connection_t * tc) { - return tc->snd_una_max - tc->snd_una - tc->sack_sb.sacked_bytes - + tc->rtx_bytes; + int 
flight_size; + + flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes) + - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ; + + /* Happens if we don't clear sacked bytes */ + if (flight_size < 0) + return 0; + + return flight_size; } /** @@ -439,9 +454,13 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +void tcp_update_rcv_wnd (tcp_connection_t * tc); + void tcp_retransmit_first_unacked (tcp_connection_t * tc); void tcp_fast_retransmit (tcp_connection_t * tc); +void tcp_cc_congestion (tcp_connection_t * tc); +void tcp_cc_recover (tcp_connection_t * tc); always_inline u32 tcp_time_now (void) @@ -453,7 +472,7 @@ u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 max_bytes); + u32 offset, u32 max_bytes); void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); @@ -476,14 +495,6 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) tc->c_c_index, timer_id, interval); } -always_inline void -tcp_retransmit_timer_set (tcp_connection_t * tc) -{ - /* XXX Switch to faster TW */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, - clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); -} - always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { @@ -506,6 +517,27 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) tc->c_c_index, timer_id, interval); } +/* XXX Switch retransmit to faster TW */ +always_inline void +tcp_retransmit_timer_set (tcp_connection_t * tc) +{ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_retransmit_timer_reset (tcp_connection_t * tc) +{ + 
tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { @@ -516,6 +548,14 @@ void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); +always_inline sack_scoreboard_hole_t * +scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) +{ + if (index != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, index); + return 0; +} + always_inline sack_scoreboard_hole_t * scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) { @@ -532,6 +572,14 @@ scoreboard_first_hole (sack_scoreboard_t * sb) return 0; } +always_inline sack_scoreboard_hole_t * +scoreboard_last_hole (sack_scoreboard_t * sb) +{ + if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->tail); + return 0; +} + always_inline void scoreboard_clear (sack_scoreboard_t * sb) { @@ -540,6 +588,10 @@ scoreboard_clear (sack_scoreboard_t * sb) { scoreboard_remove_hole (sb, hole); } + sb->sacked_bytes = 0; + sb->last_sacked_bytes = 0; + sb->snd_una_adv = 0; + sb->max_byte_sacked = 0; } always_inline u32 @@ -548,6 +600,21 @@ scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) return hole->end - hole->start; } +always_inline u32 +scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + return hole - sb->holes; +} + +always_inline void +scoreboard_init (sack_scoreboard_t * sb) +{ + sb->head = TCP_INVALID_SACK_HOLE_INDEX; + sb->tail = TCP_INVALID_SACK_HOLE_INDEX; +} + +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); + always_inline void tcp_cc_algo_register (tcp_cc_algorithm_type_e type, const tcp_cc_algorithm_t * vft) diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 069c512d..5a71694e 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,6 +19,8 @@ #include #define TCP_DEBUG (1) +#define TCP_DEBUG_CC (1) +#define TCP_DEBUG_VERBOSE (0) #define foreach_tcp_dbg_evt \ _(INIT, "") \ 
@@ -30,14 +32,24 @@ _(DELETE, "delete") \ _(SYN_SENT, "SYN sent") \ _(FIN_SENT, "FIN sent") \ + _(ACK_SENT, "ACK sent") \ + _(DUPACK_SENT, "DUPACK sent") \ _(RST_SENT, "RST sent") \ _(SYN_RCVD, "SYN rcvd") \ _(ACK_RCVD, "ACK rcvd") \ + _(DUPACK_RCVD, "DUPACK rcvd") \ _(FIN_RCVD, "FIN rcvd") \ _(RST_RCVD, "RST rcvd") \ _(PKTIZE, "packetize") \ _(INPUT, "in") \ - _(TIMER_POP, "timer pop") + _(SND_WND, "snd_wnd update") \ + _(OUTPUT, "output") \ + _(TIMER_POP, "timer pop") \ + _(CC_RTX, "retransmit") \ + _(CC_EVT, "cc event") \ + _(CC_PACK, "cc partial ack") \ + _(SEG_INVALID, "invalid segment") \ + _(ACK_RCV_ERR, "invalid ack") \ typedef enum _tcp_dbg { @@ -73,10 +85,10 @@ typedef enum _tcp_dbg_evt ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ _e, _tc->c_elog_track) -#define TCP_EVT_INIT_HANDLER(_tc, ...) \ +#define TCP_EVT_INIT_HANDLER(_tc, _fmt, ...) \ { \ _tc->c_elog_track.name = \ - (char *) format (0, "%d%c", _tc->c_c_index, 0); \ + (char *) format (0, _fmt, _tc->c_c_index, 0); \ elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\ } @@ -87,7 +99,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_OPEN_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "open: index %d", \ @@ -110,7 +122,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_BIND_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "l%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "bind: listener %d", \ @@ -138,16 +150,44 @@ typedef enum _tcp_dbg_evt .format = "delete: %d", \ .format_args = "i4", \ }; \ - DECLARE_ETD(_tc, _e, 0); \ + DECLARE_ETD(_tc, _e, 1); \ ed->data[0] = _tc->c_c_index; \ TCP_EVT_DEALLOC_HANDLER(_tc); \ } +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->rcv_wnd; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ +} + +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->rcv_nxt - _tc->irs; \ + ed->data[1] = _tc->rcv_wnd; \ + ed->data[2] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = tcp_available_wnd(_tc); \ +} + #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYN: iss %d", \ + .format = "SYNtx: iss %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -158,7 +198,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FIN: snd_nxt %d rcv_nxt %d", \ + .format = "FINtx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -170,7 +210,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RST: snd_nxt %d rcv_nxt %d", \ + .format = "RSTtx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -180,10 +220,10 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) 
\ { \ - TCP_EVT_INIT_HANDLER(_tc); \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYN rcvd: irs %d", \ + .format = "SYNrx: irs %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -194,7 +234,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FIN rcvd: snd_nxt %d rcv_nxt %d", \ + .format = "FINrx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -206,7 +246,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RST rcvd: snd_nxt %d rcv_nxt %d", \ + .format = "RSTrx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -214,54 +254,68 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \ +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "ACK: acked %u cwnd %u inflight %u", \ - .format_args = "i4i4i4", \ + .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u", \ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 3); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->bytes_acked; \ - ed->data[1] = _tc->cwnd; \ - ed->data[2] = tcp_flight_size(_tc); \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _ack - _tc->iss; \ + ed->data[3] = _tc->cwnd; \ + ed->data[4] = tcp_flight_size(_tc); \ } -#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "pktize: snd_una %u snd_nxt %u una_max %u", \ - .format_args = "i4i4i4", \ + .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u", \ + .format_args = "i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 3); \ + DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->snd_una - _tc->iss; \ - ed->data[1] = _tc->snd_nxt - _tc->iss; \ - ed->data[2] = _tc->snd_una_max - _tc->iss; \ + ed->data[1] = _tc->cwnd; \ + ed->data[2] = _tc->snd_wnd; \ + ed->data[3] = tcp_flight_size(_tc); \ } -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "out: flags %x, bytes %u", \ - .format_args = "i4i4", \ + .format = "pktize: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = flags; \ - ed->data[1] = n_bytes; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_nxt - _tc->iss; \ + ed->data[2] = tcp_available_snd_space (_tc); \ + ed->data[3] = tcp_flight_size (_tc); \ + ed->data[4] = _tc->rcv_wnd; \ } -#define TCP_EVT_INPUT_HANDLER(_tc, n_bytes, ...) \ +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "in: bytes %u rcv_nxt %u", \ - .format_args = "i4i4", \ + .format = "in: %s len %u written %d rcv_nxt %u free wnd %d", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "order", \ + "ooo", \ + }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = n_bytes; \ - ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _len; \ + ed->data[2] = _written; \ + ed->data[3] = (_tc->rcv_nxt - _tc->irs) + _written; \ + ed->data[4] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \ } #define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) 
\ @@ -296,9 +350,131 @@ typedef enum _tcp_dbg_evt ed->data[1] = _timer_id; \ } +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->rcv_las - _tc->irs; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 3, \ + .enum_strings = { \ + "invalid", \ + "old", \ + "future", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_una_max - _tc->iss; \ +} + +/* + * Congestion Control + */ + +#if TCP_DEBUG_CC +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rtx: snd_nxt %u offset %u snd %u rtx %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = offset; \ + ed->data[2] = n_bytes; \ + ed->data[3] = _tc->rtx_bytes; \ +} + +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u", \ + .format_args = "t4i4i4i4", \ + .n_enum_strings = 5, \ + .enum_strings = { \ + "fast-rtx", \ + "rtx-timeout", \ + "first-rtx", \ + "recovered", \ + "congestion", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _sub_evt; \ + ed->data[1] = tcp_available_snd_space (_tc); \ + ed->data[2] = _tc->snd_congestion - _tc->iss; \ + ed->data[3] = _tc->rtx_bytes; \ +} + +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "pack: snd_una %u snd_una_max %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_una_max - _tc->iss; \ +} + +#else +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, _snd_space, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) +#endif + +#if TCP_DBG_VERBOSE +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "snd_wnd update: %u ", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->snd_wnd; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} +#else +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) - #define TCP_EVT_DBG(_evt, _args...) 
CC(_evt, _HANDLER)(_args) #else diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index 2dbdd9b3..b91a08c0 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -12,12 +12,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - tcp_error (NONE, "no error") tcp_error (NO_LISTENER, "no listener for dst port") tcp_error (LOOKUP_DROPS, "lookup drops") tcp_error (DISPATCH, "Dispatch error") tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (PARTIALLY_ENQUEUED, "Packets partially pushed into rx fifo") tcp_error (PURE_ACK, "Pure acks") tcp_error (SYNS_RCVD, "SYNs received") tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") @@ -26,11 +26,14 @@ tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") -tcp_error (SEGMENT_INVALID, "Invalid segment") +tcp_error (SEGMENT_INVALID, "Invalid segments") +tcp_error (SEGMENT_OLD, "Old segment") tcp_error (ACK_INVALID, "Invalid ACK") tcp_error (ACK_DUP, "Duplicate ACK") tcp_error (ACK_OLD, "Old ACK") +tcp_error (ACK_FUTURE, "Future ACK") tcp_error (PKTS_SENT, "Packets sent") tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") +tcp_error (NO_WND, "No window") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 67af4321..5d11985f 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -95,13 +95,21 @@ vlib_node_registration_t tcp6_established_node; * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the * peer's reference when computing our receive window. * - * This accepts only segments within the window. 
+ * This: + * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las) + * however, is too strict when we have retransmits. Instead we just check that + * the seq is not beyond the right edge and that the end of the segment is not + * less than the left edge. + * + * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so + * use rcv_nxt in the right edge window test instead of rcv_las. + * */ always_inline u8 tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) { - return seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) - && seq_geq (seq, tc->rcv_nxt); + return (seq_geq (end_seq, tc->rcv_las) + && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd)); } void @@ -253,6 +261,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, { tcp_make_ack (tc0, b0); *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); return -1; } } @@ -262,13 +271,25 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end)) { - if (!tcp_rst (th0)) + /* If our window is 0 and the packet is in sequence, let it pass + * through for ack processing. It should be dropped later.*/ + if (tc0->rcv_wnd == 0 + && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) { - /* Send dup ack */ - tcp_make_ack (tc0, b0); - *next0 = tcp_next_output (tc0->c_is_ip4); + /* Make it look as if there's nothing to dequeue */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number; + } + else + { + /* If not RST, send dup ack */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); + } + return -1; } - return -1; } /* 2nd: check the RST bit */ @@ -326,13 +347,13 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. 
* The increase should be bound */ - tc->rttvar += (clib_abs (err) - tc->rttvar) >> 2; + tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; } else { /* First measurement. */ tc->srtt = mrtt; - tc->rttvar = mrtt << 1; + tc->rttvar = mrtt >> 1; } } @@ -394,7 +415,11 @@ tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) } } -/** Check if dupack as per RFC5681 Sec. 2 */ +/** + * Check if dupack as per RFC5681 Sec. 2 + * + * This works only if called before updating snd_wnd. + * */ always_inline u8 tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) { @@ -429,10 +454,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) } sack_scoreboard_hole_t * -scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, +scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, u32 start, u32 end) { - sack_scoreboard_hole_t *hole, *next; + sack_scoreboard_hole_t *hole, *next, *prev; u32 hole_index; pool_get (sb->holes, hole); @@ -442,6 +467,7 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, hole->end = end; hole_index = hole - sb->holes; + prev = scoreboard_get_hole (sb, prev_index); if (prev) { hole->prev = prev - sb->holes; @@ -462,28 +488,35 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, return hole; } -static void +void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; - sack_scoreboard_hole_t *hole, *next_hole; - u32 blk_index = 0; + sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; + u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; - if (!tcp_opts_sack (tc) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + sb->last_sacked_bytes = 0; + sb->snd_una_adv = 0; + old_sacked_bytes = sb->sacked_bytes; + + if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; /* Remove invalid blocks */ - vec_foreach (blk, tc->opt.sacks) 
- { - if (seq_lt (blk->start, blk->end) - && seq_gt (blk->start, tc->snd_una) - && seq_gt (blk->start, ack) && seq_lt (blk->end, tc->snd_nxt)) - continue; - - vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); - } + blk = tc->opt.sacks; + while (blk < vec_end (tc->opt.sacks)) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt)) + { + blk++; + continue; + } + vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + } /* Add block for cumulative ack */ if (seq_gt (ack, tc->snd_una)) @@ -498,7 +531,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Make sure blocks are ordered */ for (i = 0; i < vec_len (tc->opt.sacks); i++) - for (j = i; j < vec_len (tc->opt.sacks); j++) + for (j = i + 1; j < vec_len (tc->opt.sacks); j++) if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) { tmp = tc->opt.sacks[i]; @@ -506,10 +539,22 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) tc->opt.sacks[j] = tmp; } - /* If no holes, insert the first that covers all outstanding bytes */ if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) { - scoreboard_insert_hole (sb, 0, tc->snd_una, tc->snd_una_max); + /* If no holes, insert the first that covers all outstanding bytes */ + last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, + tc->snd_una, tc->snd_una_max); + sb->tail = scoreboard_hole_index (sb, last_hole); + } + else + { + /* If we have holes but snd_una_max is beyond the last hole, update + * last hole end */ + tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + last_hole = scoreboard_last_hole (sb); + if (seq_gt (tc->snd_una_max, sb->max_byte_sacked) + && seq_gt (tc->snd_una_max, last_hole->end)) + last_hole->end = tc->snd_una_max; } /* Walk the holes with the SACK blocks */ @@ -526,10 +571,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) next_hole = scoreboard_next_hole (sb, hole); /* Byte accounting */ - if (seq_lt (hole->end, ack)) + if (seq_leq (hole->end, ack)) { - /* Bytes 
lost because snd wnd left edge advances */ - if (seq_lt (next_hole->start, ack)) + /* Bytes lost because snd_wnd left edge advances */ + if (next_hole && seq_leq (next_hole->start, ack)) sb->sacked_bytes -= next_hole->start - hole->end; else sb->sacked_bytes -= ack - hole->end; @@ -539,35 +584,78 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->sacked_bytes += scoreboard_hole_bytes (hole); } + /* snd_una needs to be advanced */ + if (seq_geq (ack, hole->end)) + { + if (next_hole && seq_lt (ack, next_hole->start)) + sb->snd_una_adv = next_hole->start - ack; + else + sb->snd_una_adv = sb->max_byte_sacked - ack; + + /* all these can be delivered */ + sb->sacked_bytes -= sb->snd_una_adv; + } + + /* About to remove last hole */ + if (hole == last_hole) + { + sb->tail = hole->prev; + last_hole = scoreboard_last_hole (sb); + /* keep track of max byte sacked in case the last hole + * is acked */ + if (seq_gt (hole->end, sb->max_byte_sacked)) + sb->max_byte_sacked = hole->end; + } scoreboard_remove_hole (sb, hole); hole = next_hole; } - /* Partial overlap */ + /* Partial 'head' overlap */ else { - sb->sacked_bytes += blk->end - hole->start; - hole->start = blk->end; + if (seq_gt (blk->end, hole->start)) + { + sb->sacked_bytes += blk->end - hole->start; + hole->start = blk->end; + } blk_index++; } } else { /* Hole must be split */ - if (seq_leq (blk->end, hole->end)) + if (seq_lt (blk->end, hole->end)) { sb->sacked_bytes += blk->end - blk->start; - scoreboard_insert_hole (sb, hole, blk->end, hole->end); - hole->end = blk->start - 1; + hole_index = scoreboard_hole_index (sb, hole); + new_hole = scoreboard_insert_hole (sb, hole_index, blk->end, + hole->end); + + /* Pool might've moved */ + hole = scoreboard_get_hole (sb, hole_index); + hole->end = blk->start; + + /* New or split of tail */ + if ((last_hole->end == new_hole->end) + || seq_lt (last_hole->end, new_hole->start)) + { + last_hole = new_hole; + sb->tail = scoreboard_hole_index (sb, new_hole); + } + blk_index++; 
+ hole = scoreboard_next_hole (sb, hole); } else { - sb->sacked_bytes += hole->end - blk->start + 1; - hole->end = blk->start - 1; + sb->sacked_bytes += hole->end - blk->start; + hole->end = blk->start; hole = scoreboard_next_hole (sb, hole); } } } + + sb->last_sacked_bytes = sb->sacked_bytes + sb->snd_una_adv + - old_sacked_bytes; } /** Update snd_wnd @@ -577,72 +665,94 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) static void tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) { - if (tc->snd_wl1 < seq || (tc->snd_wl1 == seq && tc->snd_wl2 <= ack)) + if (seq_lt (tc->snd_wl1, seq) + || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) { tc->snd_wnd = snd_wnd; tc->snd_wl1 = seq; tc->snd_wl2 = ack; + TCP_EVT_DBG (TCP_EVT_SND_WND, tc); } } -static void +void tcp_cc_congestion (tcp_connection_t * tc) { + tc->snd_congestion = tc->snd_nxt; tc->cc_algo->congestion (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } -static void +void tcp_cc_recover (tcp_connection_t * tc) { - if (tcp_in_fastrecovery (tc)) - { - tc->cc_algo->recovered (tc); - tcp_recovery_off (tc); - } - else if (tcp_in_recovery (tc)) - { - tcp_recovery_off (tc); - tc->cwnd = tcp_loss_wnd (tc); - } + tc->cc_algo->recovered (tc); + + tc->rtx_bytes = 0; + tc->rcv_dupacks = 0; + tc->snd_nxt = tc->snd_una; + + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->opt.tsecr; + + tcp_fastrecovery_1_smss_off (tc); + tcp_fastrecovery_off (tc); + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } static void -tcp_cc_rcv_ack (tcp_connection_t * tc) +tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; if (tcp_in_recovery (tc)) { - partial_ack = seq_lt (tc->snd_una, tc->snd_una_max); + partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) { /* Clear retransmitted bytes. */ - tc->rtx_bytes = 0; tcp_cc_recover (tc); } else { + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + /* Clear retransmitted bytes. XXX should we clear all? 
*/ tc->rtx_bytes = 0; tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); - /* Retransmit first unacked segment */ - tcp_retransmit_first_unacked (tc); + /* In case snd_nxt is still in the past and output tries to + * shove some new bytes */ + tc->snd_nxt = tc->snd_una; + + /* XXX need proper RFC6675 support */ + if (tc->sack_sb.last_sacked_bytes) + { + tcp_fast_retransmit (tc); + } + else + { + /* Retransmit first unacked segment */ + tcp_retransmit_first_unacked (tc); + /* If window allows, send 1 SMSS of new data */ + if (seq_lt (tc->snd_nxt, tc->snd_congestion)) + tc->snd_nxt = tc->snd_congestion; + } } } else { tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->opt.tsecr; + tc->rcv_dupacks = 0; } - - tc->rcv_dupacks = 0; - tc->tsecr_last_ack = tc->opt.tsecr; } static void tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) { - ASSERT (tc->snd_una == ack); +// ASSERT (seq_geq(tc->snd_una, ack)); tc->rcv_dupacks++; if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) @@ -688,20 +798,39 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, { u32 new_snd_wnd; - /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) then send an - * ACK, drop the segment, and return */ + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) { - tcp_make_ack (tc, b); - *next = tcp_next_output (tc->c_is_ip4); - *error = TCP_ERROR_ACK_INVALID; - return -1; + /* If we have outstanding data and this is within the window, accept it, + * probably retransmit has timed out. 
Otherwise ACK segment and then + * drop it */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0, + vnet_buffer (b)->tcp.ack_number); + return -1; + } + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, + vnet_buffer (b)->tcp.ack_number); } - /* If old ACK, discard */ + /* If old ACK, probably it's an old dupack */ if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) { *error = TCP_ERROR_ACK_OLD; + TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, + vnet_buffer (b)->tcp.ack_number); + if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); + tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + } return -1; } @@ -712,32 +841,40 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) { + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); *error = TCP_ERROR_ACK_DUP; return -1; } - /* Valid ACK */ + /* + * Valid ACK + */ + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; - tc->snd_una = vnet_buffer (b)->tcp.ack_number; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; - /* Dequeue ACKed packet and update RTT */ + /* Dequeue ACKed data and update RTT */ tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); - tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, vnet_buffer (b)->tcp.ack_number, new_snd_wnd); - /* Updates congestion control (slow start/congestion avoidance) */ - tcp_cc_rcv_ack (tc); + /* If some of our sent bytes have been acked, update cc and retransmit + * timer. 
*/ + if (tc->bytes_acked) + { + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number); - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + /* Updates congestion control (slow start/congestion avoidance) */ + tcp_cc_rcv_ack (tc, b); - /* If everything has been acked, stop retransmit timer - * otherwise update */ - if (tc->snd_una == tc->snd_una_max) - tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); - else - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, tc->rto); + /* If everything has been acked, stop retransmit timer + * otherwise update */ + if (tc->snd_una == tc->snd_una_max) + tcp_retransmit_timer_reset (tc); + else + tcp_retransmit_timer_update (tc); + } return 0; } @@ -757,9 +894,7 @@ static void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { sack_block_t *new_list = 0, block; - u32 n_elts; int i; - u8 new_head = 0; /* If the first segment is ooo add it to the list. Last write might've moved * rcv_nxt over the first segment. */ @@ -768,7 +903,6 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) block.start = start; block.end = end; vec_add1 (new_list, block); - new_head = 1; } /* Find the blocks still worth keeping. */ @@ -782,20 +916,19 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) || seq_leq (tc->snd_sacks[i].start, end)) continue; - /* Save subsequent segments to new SACK list. */ - n_elts = clib_min (vec_len (tc->snd_sacks) - i, - TCP_MAX_SACK_BLOCKS - new_head); - vec_insert_elts (new_list, &tc->snd_sacks[i], n_elts, new_head); - break; + /* Save to new SACK list. 
*/ + vec_add1 (new_list, tc->snd_sacks[i]); } + ASSERT (vec_len (new_list) < TCP_MAX_SACK_BLOCKS); + /* Replace old vector with new one */ vec_free (tc->snd_sacks); tc->snd_sacks = new_list; } /** Enqueue data for delivery to application */ -always_inline u32 +always_inline int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { @@ -812,6 +945,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, vlib_buffer_get_current (b), data_len, 1 /* queue event */ ); + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written); + /* Update rcv_nxt */ if (PREDICT_TRUE (written == data_len)) { @@ -824,38 +959,61 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Send ACK confirming the update */ tc->flags |= TCP_CONN_SNDACK; + } + else if (written > 0) + { + /* We've written something but FIFO is probably full now */ + tc->rcv_nxt += written; - /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->opt)) - { - /* Remove SACK blocks that have been delivered */ - tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); - } + /* Depending on how fast the app is, all remaining buffers in burst will + * not be enqueued. Should we inform peer of the damage? XXX */ + return TCP_ERROR_PARTIALLY_ENQUEUED; } else { - ASSERT (0); return TCP_ERROR_FIFO_FULL; } + /* Update SACK list if need be */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* Remove SACK blocks that have been delivered */ + tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); + } + return TCP_ERROR_ENQUEUED; } /** Enqueue out-of-order data */ -always_inline u32 +always_inline int tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; u32 offset, seq; + int rv; + + /* Pure ACK. 
Do nothing */ + if (PREDICT_FALSE (data_len == 0)) + { + return TCP_ERROR_PURE_ACK; + } s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); seq = vnet_buffer (b)->tcp.seq_number; offset = seq - tc->rcv_nxt; - if (svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, - data_len, vlib_buffer_get_current (b))) - return TCP_ERROR_FIFO_FULL; + rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, + data_len, vlib_buffer_get_current (b)); + + /* Nothing written */ + if (rv) + { + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0); + return TCP_ERROR_FIFO_FULL; + } + + TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); /* Update SACK list if in use */ if (tcp_opts_sack_permitted (&tc->opt)) @@ -875,20 +1033,23 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, } /** - * Check if ACK could be delayed. DELACK timer is set only after frame is - * processed so this can return true for a full bursts of packets. + * Check if ACK could be delayed. If ack can be delayed, it should return + * true for a full frame. If we're always acking return 0. */ always_inline int tcp_can_delack (tcp_connection_t * tc) { - /* If there's no DELACK timer set and the last window sent wasn't 0 we - * can safely delay. */ - if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK) - && (tc->flags & TCP_CONN_SENT_RCV_WND0) == 0 - && (tc->flags & TCP_CONN_SNDACK) == 0) - return 1; + /* Send ack if ... 
*/ + if (TCP_ALWAYS_ACK + /* just sent a rcv wnd 0 */ + || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 + /* constrained to send ack */ + || (tc->flags & TCP_CONN_SNDACK) != 0 + /* we're almost out of tx wnd */ + || tcp_available_snd_space (tc) < 2 * tc->snd_mss) + return 0; - return 0; + return 1; } static int @@ -900,23 +1061,33 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) { + /* Old sequence numbers allowed through because they overlapped + * the rx window */ + if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) + { + error = TCP_ERROR_SEGMENT_OLD; + *next0 = TCP_NEXT_DROP; + goto done; + } + error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); - /* Don't send more than 3 dupacks per burst - * XXX decide if this is good */ - if (tc->snt_dupacks < 3) - { - /* RFC2581: Send DUPACK for fast retransmit */ - tcp_make_ack (tc, b); - *next0 = tcp_next_output (tc->c_is_ip4); + /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open + * cwnd on remote peer when congested 2) acks leaving should have the + * latest rcv_wnd since the burst may eaten up all of it, so only the + * old ones could be filtered. + */ - /* Mark as DUPACK. We may filter these in output if - * the burst fills the holes. */ - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + /* RFC2581: Send DUPACK for fast retransmit */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); - tc->snt_dupacks++; - } + /* Mark as DUPACK. We may filter these in output if + * the burst fills the holes. */ + if (n_data_bytes) + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc); goto done; } @@ -924,63 +1095,45 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, * segments can be enqueued after fifo tail offset changes. 
*/ error = tcp_session_enqueue_data (tc, b, n_data_bytes); - TCP_EVT_DBG (TCP_EVT_INPUT, tc, n_data_bytes); + if (n_data_bytes == 0) + { + *next0 = TCP_NEXT_DROP; + goto done; + } + + if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL)) + *next0 = TCP_NEXT_DROP; /* Check if ACK can be delayed */ - if (tcp_can_delack (tc)) + if (!tcp_can_delack (tc)) { - /* Nothing to do for pure ACKs */ + /* Nothing to do for pure ACKs XXX */ if (n_data_bytes == 0) goto done; - /* If connection has not been previously marked for delay ack - * add it to the list and flag it */ - if (!tc->flags & TCP_CONN_DELACK) - { - vec_add1 (tm->delack_connections[tc->c_thread_index], - tc->c_c_index); - tc->flags |= TCP_CONN_DELACK; - } + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); } else { - /* Check if a packet has already been enqueued to output for burst. - * If yes, then drop this one, otherwise, let it pass through to - * output */ - if ((tc->flags & TCP_CONN_BURSTACK) == 0) - { - *next0 = tcp_next_output (tc->c_is_ip4); - tcp_make_ack (tc, b); - error = TCP_ERROR_ENQUEUED; - - /* TODO: maybe add counter to ensure N acks will be sent/burst */ - tc->flags |= TCP_CONN_BURSTACK; - } + if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) + tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME); } done: return error; } -void -delack_timers_init (tcp_main_t * tm, u32 thread_index) +always_inline void +tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val) { - tcp_connection_t *tc; - u32 i, *conns; - tw_timer_wheel_16t_2w_512sl_t *tw; - - tw = &tm->timer_wheels[thread_index]; - conns = tm->delack_connections[thread_index]; - for (i = 0; i < vec_len (conns); i++) - { - tc = pool_elt_at_index (tm->connections[thread_index], conns[i]); - ASSERT (0 != tc); + if (PREDICT_TRUE (!val)) + return; - tc->timers[TCP_TIMER_DELACK] - = tw_timer_start_16t_2w_512sl (tw, conns[i], - TCP_TIMER_DELACK, TCP_DELACK_TIME); - } - vec_reset_length (tm->delack_connections[thread_index]); 
+ if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val); } always_inline uword @@ -1027,7 +1180,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (tc0 == 0)) { error0 = TCP_ERROR_INVALID_CONNECTION; - goto drop; + goto done; } /* Checksum computed by ipx_local no need to compute again */ @@ -1061,18 +1214,22 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) { error0 = TCP_ERROR_SEGMENT_INVALID; - goto drop; + TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, + vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); + goto done; } /* 5: check the ACK field */ if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) { - goto drop; + goto done; } /* 6: check the URG bit TODO */ /* 7: process the segment text */ + vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); @@ -1088,7 +1245,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } - drop: + done: b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -1103,17 +1260,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } errors = session_manager_flush_enqueue_events (my_thread_index); - if (errors) - { - if (is_ip4) - vlib_node_increment_counter (vm, tcp4_established_node.index, - TCP_ERROR_EVENT_FIFO_FULL, errors); - else - vlib_node_increment_counter (vm, tcp6_established_node.index, - TCP_ERROR_EVENT_FIFO_FULL, errors); - } - - delack_timers_init (tm, my_thread_index); + tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); return from_frame->n_vectors; } @@ -1602,7 +1749,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, 
stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); + tcp_retransmit_timer_reset (tc0); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -1668,7 +1815,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); /* Stop retransmit */ - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT); + tcp_retransmit_timer_reset (tc0); goto drop; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 114a5b9e..a671f728 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -125,15 +125,33 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) u32 tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) { - u32 available_space, max_fifo, observed_wnd; - if (state < TCP_STATE_ESTABLISHED) return tcp_initial_window_to_advertise (tc); + tcp_update_rcv_wnd (tc); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } + else + { + tc->flags &= ~TCP_CONN_SENT_RCV_WND0; + } + + return tc->rcv_wnd >> tc->rcv_wscale; +} + +void +tcp_update_rcv_wnd (tcp_connection_t * tc) +{ + i32 observed_wnd; + u32 available_space, max_fifo, wnd; + /* * Figure out how much space we have available */ - available_space = stream_session_max_enqueue (&tc->connection); + available_space = stream_session_max_rx_enqueue (&tc->connection); max_fifo = stream_session_fifo_size (&tc->connection); ASSERT (tc->opt.mss < max_fifo); @@ -145,23 +163,25 @@ tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) * Use the above and what we know about what we've previously advertised * to compute the new window */ - observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + if (observed_wnd < 0) + observed_wnd = 0; /* Bad. 
Thou shalt not shrink */ if (available_space < observed_wnd) { - if (available_space == 0) - clib_warning ("Didn't shrink rcv window despite not having space"); + /* Does happen! */ + wnd = observed_wnd; } - - tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); - - if (tc->rcv_wnd == 0) + else { - tc->flags |= TCP_CONN_SENT_RCV_WND0; + wnd = available_space; } - return tc->rcv_wnd >> tc->rcv_wscale; + if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) + wnd += 1 << tc->rcv_wscale; + + tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } /** @@ -363,7 +383,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = tm->vlib_main->cpu_index; \ + u32 cpu_index = os_get_cpu_number(); \ my_tx_buffers = tm->tx_buffers[cpu_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ @@ -381,6 +401,14 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 cpu_index = os_get_cpu_number(); \ + my_tx_buffers = tm->tx_buffers[cpu_index]; \ + _vec_len (my_tx_buffers) +=1; \ +} while (0) + always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { @@ -421,8 +449,6 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); tcp_options_write ((u8 *) (th + 1), snd_opts); - - /* Mark as ACK */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; } @@ -432,12 +458,12 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, void tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + TCP_EVT_DBG 
(TCP_EVT_ACK_SENT, tc); } /** @@ -446,8 +472,7 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 flags = 0; tcp_reuse_buffer (vm, b); @@ -467,8 +492,7 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_options_t _snd_opts, *snd_opts = &_snd_opts; u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; @@ -631,7 +655,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_len, flags = 0; tcp_header_t *th, *pkt_th; u32 seq, ack; @@ -736,7 +760,7 @@ tcp_send_syn (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_opts_len, tcp_opts_len; tcp_header_t *th; u32 time_now; @@ -795,9 +819,9 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? 
tcp4_output_node.index : tcp6_output_node.index; - f = vlib_get_frame_to_node (vm, next_index); /* Enqueue the packet */ + f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); to_next[0] = bi; f->n_vectors = 1; @@ -813,7 +837,7 @@ tcp_send_fin (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); @@ -884,22 +908,21 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + /* TODO this is updated in output as well ... */ + if (tc->snd_nxt > tc->snd_una_max) + tc->snd_una_max = tc->snd_nxt; TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } -/* Send delayed ACK when timer expires */ void -tcp_timer_delack_handler (u32 index) +tcp_send_ack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 thread_index = os_get_cpu_number (); - tcp_connection_t *tc; + vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; u32 bi; - tc = tcp_connection_get (index, thread_index); - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); @@ -907,12 +930,22 @@ tcp_timer_delack_handler (u32 index) /* Fill in the ACK */ tcp_make_ack (tc, b); - tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; - tc->flags &= ~TCP_CONN_DELACK; - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } +/* Send delayed ACK when timer expires */ +void +tcp_timer_delack_handler (u32 index) +{ + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; +// tc->flags &= ~TCP_CONN_DELACK; + tcp_send_ack (tc); +} + /** Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to @@ -920,59 +953,74 @@ 
tcp_timer_delack_handler (u32 index) * */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 max_bytes) + u32 offset, u32 max_bytes) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 n_bytes, offset = 0; - sack_scoreboard_hole_t *hole; - u32 hole_size; + vlib_main_t *vm = vlib_get_main (); + u32 n_bytes = 0; tcp_reuse_buffer (vm, b); ASSERT (tc->state >= TCP_STATE_ESTABLISHED); ASSERT (max_bytes != 0); - if (tcp_opts_sack_permitted (&tc->opt)) - { - /* XXX get first hole not retransmitted yet */ - hole = scoreboard_first_hole (&tc->sack_sb); - if (!hole) - return 0; - - offset = hole->start - tc->snd_una; - hole_size = hole->end - hole->start; + max_bytes = clib_min (tc->snd_mss, max_bytes); - ASSERT (hole_size); + /* Start is beyond snd_congestion */ + if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + goto done; - if (hole_size < max_bytes) - max_bytes = hole_size; - } - else + /* Don't overshoot snd_congestion */ + if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) { - if (seq_geq (tc->snd_nxt, tc->snd_una_max)) - return 0; + max_bytes = tc->snd_congestion - tc->snd_nxt; + if (max_bytes == 0) + goto done; } + ASSERT (max_bytes <= tc->snd_mss); + n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); ASSERT (n_bytes != 0); - + b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state); +done: + TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); return n_bytes; } +/** + * Reset congestion control, switch cwnd to loss window and try again. 
+ */ +static void +tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +{ + /* Cleanly recover cc (also clears up fast retransmit) */ + if (tcp_in_fastrecovery (tc)) + { + tcp_cc_recover (tc); + } + else + { + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); + } + + /* Start again from the beginning */ + tc->cwnd = tcp_loss_wnd (tc); + tc->snd_congestion = tc->snd_una_max; +} + static void tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u32 thread_index = os_get_cpu_number (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, max_bytes, snd_space; + u32 bi, snd_space, n_bytes; if (is_syn) { @@ -998,26 +1046,43 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->state >= TCP_STATE_ESTABLISHED) { - tcp_fastrecovery_off (tc); + /* First retransmit timeout */ + if (tc->rto_boff == 1) + tcp_rtx_timeout_cc_recover (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); /* Figure out what and how many bytes we can send */ snd_space = tcp_available_snd_space (tc); - max_bytes = clib_min (tc->snd_mss, snd_space); - if (max_bytes == 0) + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); + + if (snd_space == 0) { clib_warning ("no wnd to retransmit"); + tcp_return_buffer (tm); + + /* Force one segment */ + tcp_retransmit_first_unacked (tc); + + /* Re-enable retransmit timer. Output may be unwilling + * to do it for us */ + tcp_retransmit_timer_set (tc); + return; } - tcp_prepare_retransmit_segment (tc, b, max_bytes); + else + { + /* No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); + scoreboard_clear (&tc->sack_sb); - tc->rtx_bytes += max_bytes; + if (n_bytes == 0) + return; - /* No fancy recovery for now! 
*/ - scoreboard_clear (&tc->sack_sb); + tc->rtx_bytes += n_bytes; + } } else { @@ -1072,63 +1137,110 @@ tcp_timer_retransmit_syn_handler (u32 index) } /** - * Retansmit first unacked segment */ + * Retransmit first unacked segment + */ void tcp_retransmit_first_unacked (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_nxt = tc->snd_nxt; + vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi; + u32 bi, n_bytes; tc->snd_nxt = tc->snd_una; /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - tcp_prepare_retransmit_segment (tc, b, tc->snd_mss); - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + if (n_bytes == 0) + return; - tc->snd_nxt = snd_nxt; - tc->rtx_bytes += tc->snd_mss; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc->rtx_bytes += n_bytes; +} + +sack_scoreboard_hole_t * +scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole = 0; + +// hole = scoreboard_first_hole (&tc->sack_sb); +// if (hole) +// { +// +// offset = hole->start - tc->snd_una; +// hole_size = hole->end - hole->start; +// +// ASSERT(hole_size); +// +// if (hole_size < max_bytes) +// max_bytes = hole_size; +// } + return hole; } +/** + * Do fast retransmit. 
+ */ void tcp_fast_retransmit (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_space, max_bytes, n_bytes, bi; + vlib_main_t *vm = vlib_get_main (); + u32 bi; + int snd_space; + u32 n_written = 0, offset = 0; vlib_buffer_t *b; + u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); - clib_warning ("fast retransmit!"); - /* Start resending from first un-acked segment */ tc->snd_nxt = tc->snd_una; snd_space = tcp_available_snd_space (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + /* If we have SACKs use them */ + if (tcp_opts_sack_permitted (&tc->opt) + && scoreboard_first_hole (&tc->sack_sb)) + use_sacks = 0; - while (snd_space) + while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); + + if (use_sacks) + { + scoreboard_first_rtx_hole (&tc->sack_sb); + } + else + { + offset += n_written; + } - max_bytes = clib_min (tc->snd_mss, snd_space); - n_bytes = tcp_prepare_retransmit_segment (tc, b, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ - if (n_bytes == 0) - return; - - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } - snd_space -= n_bytes; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc->rtx_bytes += n_written; + snd_space -= n_written; } - /* If window allows, send new data */ - tc->snd_nxt = tc->snd_una_max; + /* If window allows, send 1 SMSS of new data */ + if (seq_lt (tc->snd_nxt, tc->snd_congestion)) + tc->snd_nxt = tc->snd_congestion; } always_inline u32 @@ -1209,8 +1321,6 @@ tcp46_output_inline (vlib_main_t * vm, if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) { - ASSERT (tc0->snt_dupacks > 0); - tc0->snt_dupacks--; if (!tcp_session_has_ooo_data (tc0)) { error0 = TCP_ERROR_FILTERED_DUPACKS; @@ -1223,8 +1333,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rcv_las = tc0->rcv_nxt; 
/* Stop DELACK timer and fix flags */ - tc0->flags &= - ~(TCP_CONN_SNDACK | TCP_CONN_DELACK | TCP_CONN_BURSTACK); + tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) { tcp_timer_reset (tc0, TCP_TIMER_DELACK); diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index 866c5fd6..4f28cf32 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -137,7 +137,7 @@ enum typedef struct _sack_block { u32 start; /**< Start sequence number */ - u32 end; /**< End sequence number */ + u32 end; /**< End sequence number (first outside) */ } sack_block_t; typedef struct diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c new file mode 100644 index 00000000..0725bb04 --- /dev/null +++ b/src/vnet/tcp/tcp_test.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define TCP_TEST_I(_cond, _comment, _args...) \ +({ \ + int _evald = (_cond); \ + if (!(_evald)) { \ + fformat(stderr, "FAIL:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } else { \ + fformat(stderr, "PASS:%d: " _comment "\n", \ + __LINE__, ##_args); \ + } \ + _evald; \ +}) + +#define TCP_TEST(_cond, _comment, _args...) 
\ +{ \ + if (!TCP_TEST_I(_cond, _comment, ##_args)) { \ + return 1; \ + } \ +} + +static int +tcp_test_sack () +{ + tcp_connection_t _tc, *tc = &_tc; + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *sacks = 0, block; + sack_scoreboard_hole_t *hole; + int i; + + memset (tc, 0, sizeof (*tc)); + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + tc->opt.flags |= TCP_OPTS_FLAG_SACK; + scoreboard_init (&tc->sack_sb); + + for (i = 0; i < 1000 / 100; i++) + { + block.start = i * 100; + block.end = (i + 1) * 100; + vec_add1 (sacks, block); + } + + /* + * Inject even blocks + */ + + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->opt.sacks, sacks[i * 2]); + } + tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tcp_rcv_sacks (tc, 0); + + TCP_TEST ((pool_elts (sb->holes) == 5), + "scoreboard has %d elements", pool_elts (sb->holes)); + + /* First SACK block should be rejected */ + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 0 && hole->end == 200), + "first hole start %u end %u", hole->start, hole->end); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 900 && hole->end == 1000), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 400), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->last_sacked_bytes == 400), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Inject odd blocks + */ + + vec_reset_length (tc->opt.sacks); + for (i = 0; i < 1000 / 200; i++) + { + vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + } + tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tcp_rcv_sacks (tc, 0); + + hole = scoreboard_first_hole (sb); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d holes", pool_elts (sb->holes)); + TCP_TEST ((hole->start == 0 && hole->end == 100), + "first hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", 
sb->sacked_bytes); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->max_byte_sacked == 1000), + "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->last_sacked_bytes == 500), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Ack until byte 100, all bytes are now acked + sacked + */ + tcp_rcv_sacks (tc, 100); + + TCP_TEST ((pool_elts (sb->holes) == 0), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 900), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->max_byte_sacked == 1000), + "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + + /* + * Add new block + */ + + vec_reset_length (tc->opt.sacks); + + block.start = 1200; + block.end = 1300; + vec_add1 (tc->opt.sacks, block); + + tc->snd_una_max = 1500; + tc->snd_una = 1000; + tc->snd_nxt = 1500; + tcp_rcv_sacks (tc, 1000); + + TCP_TEST ((sb->snd_una_adv == 0), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((pool_elts (sb->holes) == 2), + "scoreboard has %d holes", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->start == 1000 && hole->end == 1200), + "first hole start %u end %u", hole->start, hole->end); + hole = scoreboard_last_hole (sb); + TCP_TEST ((hole->start == 1300 && hole->end == 1500), + "last hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->sacked_bytes == 100), "sacked bytes %d", sb->sacked_bytes); + + /* + * Ack first hole + */ + + vec_reset_length (tc->opt.sacks); + tcp_rcv_sacks (tc, 1200); + + TCP_TEST ((sb->snd_una_adv == 100), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d elements", pool_elts (sb->holes)); + + /* + * Remove all + */ 
+ + scoreboard_clear (sb); + TCP_TEST ((pool_elts (sb->holes) == 0), + "number of holes %d", pool_elts (sb->holes)); + return 0; +} + +static clib_error_t * +tcp_test (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + int res = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "sack")) + { + res = tcp_test_sack (); + } + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + } + + if (res) + { + return clib_error_return (0, "TCP unit test failed"); + } + else + { + return 0; + } +} + +VLIB_CLI_COMMAND (tcp_test_command, static) = +{ +.path = "test tcp",.short_help = "internal tcp unit tests",.function = + tcp_test,}; +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c index 46c8e734..57f774c5 100644 --- a/src/vnet/udp/builtin_server.c +++ b/src/vnet/udp/builtin_server.c @@ -39,10 +39,10 @@ builtin_session_disconnect_callback (stream_session_t * s) } static int -builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) +builtin_server_rx_callback (stream_session_t * s) { svm_fifo_t *rx_fifo, *tx_fifo; - u32 this_transfer; + u32 this_transfer, max_deq, max_enq; int actual_transfer; u8 *my_copy_buffer; session_fifo_event_t evt; @@ -52,9 +52,9 @@ builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; - this_transfer = svm_fifo_max_enqueue (tx_fifo) - < svm_fifo_max_dequeue (rx_fifo) ? - svm_fifo_max_enqueue (tx_fifo) : svm_fifo_max_dequeue (rx_fifo); + max_deq = svm_fifo_max_dequeue (rx_fifo); + max_enq = svm_fifo_max_enqueue (tx_fifo); + this_transfer = max_enq < max_deq ? 
max_enq : max_deq; vec_validate (my_copy_buffer, this_transfer - 1); _vec_len (my_copy_buffer) = this_transfer; @@ -64,17 +64,20 @@ builtin_server_rx_callback (stream_session_t * s, session_fifo_event_t * ep) ASSERT (actual_transfer == this_transfer); actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, 0, this_transfer, my_copy_buffer); + ASSERT (actual_transfer == this_transfer); copy_buffers[s->thread_index] = my_copy_buffer; - /* Fabricate TX event, send to ourselves */ - evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - /* $$$$ for event logging */ - evt.enqueue_length = actual_transfer; - evt.event_id = 0; - q = session_manager_get_vpp_event_queue (s->thread_index); - unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + if (svm_fifo_set_event (tx_fifo)) + { + /* Fabricate TX event, send to ourselves */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = 0; + q = session_manager_get_vpp_event_queue (s->thread_index); + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + } return 0; } diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 88278735..4b22109b 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -244,44 +244,53 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* Get session's server */ server0 = application_get (s0->app_index); - /* Fabricate event */ - evt.fifo = s0->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; - evt.event_id = serial_number++; - evt.enqueue_length = svm_fifo_max_dequeue (s0->server_rx_fifo); - /* Built-in server? Deliver the goods... 
*/ if (server0->cb_fns.builtin_server_rx_callback) { - server0->cb_fns.builtin_server_rx_callback (s0, &evt); + server0->cb_fns.builtin_server_rx_callback (s0); continue; } - /* Add event to server's event queue */ - q = server0->event_queue; - - /* Don't block for lack of space */ - if (PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (server0->event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - else + if (svm_fifo_set_event (s0->server_rx_fifo)) { - vlib_node_increment_counter (vm, udp4_uri_input_node.index, - SESSION_ERROR_FIFO_FULL, 1); + /* Fabricate event */ + evt.fifo = s0->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + + /* Add event to server's event queue */ + q = server0->event_queue; + + /* Don't block for lack of space */ + if (PREDICT_TRUE (q->cursize < q->maxsize)) + { + unix_shared_memory_queue_add (server0->event_queue, + (u8 *) & evt, + 0 /* do wait for mutex */ ); + } + else + { + vlib_node_increment_counter (vm, udp4_uri_input_node.index, + SESSION_ERROR_FIFO_FULL, 1); + } } + /* *INDENT-OFF* */ if (1) { ELOG_TYPE_DECLARE (e) = { - .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + .format = "evt-enqueue: id %d length %d", + .format_args = "i4i4",}; struct { u32 data[2]; } *ed; ed = ELOG_DATA (&vlib_global_main.elog_main, e); ed->data[0] = evt.event_id; - ed->data[1] = evt.enqueue_length; + ed->data[1] = svm_fifo_max_dequeue (s0->server_rx_fifo); } + /* *INDENT-ON* */ + } vec_reset_length (session_indices_to_enqueue); -- cgit 1.2.3-korg From 3e350af5d3e9744a4529a28dd293b2d4601442f7 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 30 Mar 2017 02:54:28 -0700 Subject: TCP cc/window management fixes and debugging - added persist timer - update rcv_las whenever sending an ack - moved fifo size to its own cache line - improved session and builtin client debugging Change-Id: Ia649cf942cf0c061a713e8b67f0eb6974a6cd55b Signed-off-by: Florin Coras 
Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 37 +++++----- src/svm/svm_fifo.h | 8 ++- src/vnet/session/node.c | 24 ++++--- src/vnet/session/session.c | 6 +- src/vnet/session/session.h | 12 +++- src/vnet/session/session_debug.h | 32 ++++++++- src/vnet/tcp/builtin_client.c | 146 +++++++++++++++++++++++++++------------ src/vnet/tcp/builtin_server.c | 12 +++- src/vnet/tcp/tcp.c | 6 +- src/vnet/tcp/tcp.h | 36 +++++++++- src/vnet/tcp/tcp_debug.h | 66 +++++++++++++----- src/vnet/tcp/tcp_input.c | 68 +++++++++--------- src/vnet/tcp/tcp_output.c | 79 +++++++++++++++++---- 13 files changed, 386 insertions(+), 146 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 07b0d2df..cc84feb9 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -254,6 +254,10 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { ooo_segment_t *s; u32 index, bytes = 0, diff; + u32 cursize; + + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); @@ -286,8 +290,8 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) /* If tail is adjacent to an ooo segment, 'consume' it */ if (diff == 0) { - bytes = ((f->nitems - f->cursize) >= s->length) ? s->length : - f->nitems - f->cursize; + bytes = ((f->nitems - cursize) >= s->length) ? 
s->length : + f->nitems - cursize; f->tail += bytes; f->tail %= f->nitems; @@ -305,11 +309,12 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == f->nitems)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + + if (PREDICT_FALSE (cursize == f->nitems)) return -2; /* fifo stuffed */ - /* read cursize, which can only decrease while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to copy */ @@ -382,8 +387,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, ASSERT (offset > 0); - /* read cursize, which can only decrease while we're working */ - cursize = f->cursize; + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; /* Will this request fit? */ @@ -437,11 +442,11 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to copy */ @@ -495,11 +500,11 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems, real_head; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; real_head = f->head + offset; real_head = 
real_head >= nitems ? real_head - nitems : real_head; @@ -532,11 +537,11 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) u32 total_drop_bytes, first_drop_bytes, second_drop_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to drop */ diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 39556173..80e5b0f2 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -44,14 +44,16 @@ typedef struct typedef struct { + volatile u32 cursize; /**< current fifo size */ + u32 nitems; + CLIB_CACHE_LINE_ALIGN_MARK (end_cursize); + pthread_mutex_t mutex; /* 8 bytes */ pthread_cond_t condvar; /* 8 bytes */ svm_lock_tag_t tag; - volatile u32 cursize; /**< current fifo size */ volatile u8 has_event; /**< non-zero if deq event exists */ u32 owner_pid; - u32 nitems; /* Backpointers */ u32 server_session_index; @@ -105,7 +107,7 @@ svm_fifo_max_dequeue (svm_fifo_t * f) static inline u32 svm_fifo_max_enqueue (svm_fifo_t * f) { - return f->nitems - f->cursize; + return f->nitems - svm_fifo_max_dequeue (f); } static inline u8 diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 8681105c..b86e87d9 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -119,15 +119,20 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Nothing to read return */ if (max_dequeue0 == 0) - { - return 0; - } + return 0; /* Ensure we're not writing more than transport window allows */ - max_len_to_snd0 = clib_min (max_dequeue0, snd_space0); - - /* TODO check if transport is willing to send len_to_snd0 - * bytes (Nagle) */ + if (max_dequeue0 < snd_space0) + { + /* Constrained by tx queue. 
Try to send only fully formed segments */ + max_len_to_snd0 = (max_dequeue0 > snd_mss0) ? + max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0; + /* TODO Nagle ? */ + } + else + { + max_len_to_snd0 = snd_space0; + } n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE; n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes); @@ -308,11 +313,14 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, int n_tx_packets = 0; u32 my_thread_index = vm->cpu_index; int i, rv; + f64 now = vlib_time_now (vm); + + SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index); /* * Update TCP time */ - tcp_update_time (vlib_time_now (vm), my_thread_index); + tcp_update_time (now, my_thread_index); /* * Get vpp queue events diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index f10918aa..8e2b2616 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -556,7 +556,7 @@ session_manager_allocate_session_fifos (session_manager_main_t * smm, u8 * added_a_segment) { svm_fifo_segment_private_t *fifo_segment; - u32 fifo_size, default_fifo_size = 128 << 10; /* TODO config */ + u32 fifo_size, default_fifo_size = 1 << 16; /* TODO config */ int i; *added_a_segment = 0; @@ -1293,6 +1293,10 @@ session_manager_main_enable (vlib_main_t * vm) vec_validate (smm->current_enqueue_epoch, num_threads - 1); vec_validate (smm->vpp_event_queues, num_threads - 1); +#if SESSION_DBG + vec_validate (smm->last_event_poll_by_thread, num_threads - 1); +#endif + /* $$$$ preallocate hack config parameter */ for (i = 0; i < 200000; i++) { diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index a39bc06f..6878b4d2 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -20,6 +20,7 @@ #include #include #include +#include #define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0) #define INVALID_INDEX ((u32)~0) @@ -36,7 +37,7 @@ typedef enum FIFO_EVENT_BUILTIN_RX } fifo_event_type_t; -#define foreach_session_input_error \ 
+#define foreach_session_input_error \ _(NO_SESSION, "No session drops") \ _(NO_LISTENER, "No listener for dst port drops") \ _(ENQUEUED, "Packets pushed into rx fifo") \ @@ -218,6 +219,15 @@ struct _session_manager_main /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; + +#if SESSION_DBG + /** + * last event poll time by thread + * Debug only. Will cause false cache-line sharing as-is + */ + f64 *last_event_poll_by_thread; +#endif + }; extern session_manager_main_t session_manager_main; diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h index 80a97cd5..eb11f1a0 100644 --- a/src/vnet/session/session_debug.h +++ b/src/vnet/session/session_debug.h @@ -16,13 +16,13 @@ #define SRC_VNET_SESSION_SESSION_DEBUG_H_ #include -#include #include #define foreach_session_dbg_evt \ _(ENQ, "enqueue") \ _(DEQ, "dequeue") \ - _(DEQ_NODE, "dequeue") + _(DEQ_NODE, "dequeue") \ + _(POLL_GAP_TRACK, "poll gap track") \ typedef enum _session_evt_dbg { @@ -33,6 +33,7 @@ typedef enum _session_evt_dbg #define SESSION_DBG (0) #define SESSION_DEQ_NODE_EVTS (0) +#define SESSION_EVT_POLL_DBG (1) #if TRANSPORT_DEBUG && SESSION_DBG @@ -97,9 +98,34 @@ typedef enum _session_evt_dbg #define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) #endif +#if SESSION_DBG && SESSION_EVT_POLL_DBG +#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index) \ +{ /* log ms elapsed since this thread's previous poll; reads 'now' from the caller's scope */ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "nixon-gap: %d MS", \ + .format_args = "i4", \ + }; \ + DEC_SESSION_ED(_e, 1); \ + ed->data[0] = (u32) ((now - \ + (_smm)->last_event_poll_by_thread[_my_thread_index])*1000.0); \ +} +#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index) \ +{ /* log poll gaps above 500us, then record this poll time; reads 'now' from the caller's scope */ \ + if (PREDICT_TRUE( \ + (_smm)->last_event_poll_by_thread[_my_thread_index] != 0.0)) \ + if (now > (_smm)->last_event_poll_by_thread[_my_thread_index] + 500e-6)\ + SESSION_EVT_POLL_GAP(_smm, _my_thread_index); \ + (_smm)->last_event_poll_by_thread[_my_thread_index] = now; \ +} + +#else +#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index) 
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) - #define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 83cdbc1b..e3705060 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -43,6 +43,10 @@ #include #undef vl_printfun +#define TCP_BUILTIN_CLIENT_DBG (1) +#define TCP_BUILTIN_CLIENT_VPP_THREAD (0) +#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD) + static void send_test_chunk (tclient_main_t * tm, session_t * s) { @@ -52,35 +56,50 @@ send_test_chunk (tclient_main_t * tm, session_t * s) session_fifo_event_t evt; static int serial_number = 0; int rv; + test_buf_offset = s->bytes_sent % vec_len (test_data); + bytes_this_chunk = vec_len (test_data) - test_buf_offset; - while (s->bytes_to_send > 0) - { + bytes_this_chunk = bytes_this_chunk < s->bytes_to_send + ? bytes_this_chunk : s->bytes_to_send; - test_buf_offset = s->bytes_sent % vec_len (test_data); - bytes_this_chunk = vec_len (test_data) - test_buf_offset; + rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , + bytes_this_chunk, + test_data + test_buf_offset); - bytes_this_chunk = bytes_this_chunk < s->bytes_to_send - ? bytes_this_chunk : s->bytes_to_send; + /* If we managed to enqueue data... */ + if (rv > 0) + { + if (TCP_BUILTIN_CLIENT_DBG) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-enq: %d bytes", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 data[1]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = rv; + } - rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , - bytes_this_chunk, - test_data + test_buf_offset); + /* Account for it... 
*/ + s->bytes_to_send -= rv; + s->bytes_sent += rv; - if (rv > 0) + /* Poke the TCP state machine */ + if (svm_fifo_set_event (s->server_tx_fifo)) { - s->bytes_to_send -= rv; - s->bytes_sent += rv; + /* Fabricate TX event, send to vpp */ + evt.fifo = s->server_tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; - if (svm_fifo_set_event (s->server_tx_fifo)) - { - /* Fabricate TX event, send to vpp */ - evt.fifo = s->server_tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - evt.event_id = serial_number++; - - unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - } + unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); } } } @@ -89,39 +108,55 @@ static void receive_test_chunk (tclient_main_t * tm, session_t * s) { svm_fifo_t *rx_fifo = s->server_rx_fifo; - int n_read, bytes, i; + int n_read, test_bytes = 0; - bytes = svm_fifo_max_dequeue (rx_fifo); /* Allow enqueuing of new event */ - svm_fifo_unset_event (rx_fifo); + // svm_fifo_unset_event (rx_fifo); - /* Read the bytes */ - do + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), + tm->rx_buf); + if (n_read > 0) { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), - tm->rx_buf); - if (n_read > 0) + if (TCP_BUILTIN_CLIENT_DBG) { - bytes -= n_read; + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-deq: %d bytes", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 data[1]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = n_read; + } + + if (test_bytes) + { + int i; for (i = 0; i < n_read; i++) { if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff)) { clib_warning ("read %d error at byte %lld, 0x%x not 0x%x", - n_read, s->bytes_received + i, - tm->rx_buf[i], + n_read, s->bytes_received + i, tm->rx_buf[i], ((s->bytes_received + i) & 0xff)); } } - s->bytes_to_receive -= n_read; - s->bytes_received += n_read; 
} - + s->bytes_to_receive -= n_read; + s->bytes_received += n_read; } - while (n_read < 0 || bytes > 0); } +#if TCP_BUILTIN_CLIENT_VPP_THREAD +static void +#else static void * +#endif tclient_thread_fn (void *arg) { tclient_main_t *tm = &tclient_main; @@ -139,6 +174,8 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } + clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + while (1) { /* Wait until we're told to get busy */ @@ -186,12 +223,12 @@ tclient_thread_fn (void *arg) /* Disconnect sessions... */ vec_reset_length (session_indices); - pool_foreach (sp, tm->sessions, ( - { - vec_add1 (session_indices, - sp - tm->sessions); - } - )); + + /* *INDENT-OFF* */ + pool_foreach (sp, tm->sessions, ({ + vec_add1 (session_indices, sp - tm->sessions); + })); + /* *INDENT-ON* */ for (i = 0; i < vec_len (session_indices); i++) { @@ -207,7 +244,9 @@ tclient_thread_fn (void *arg) } } /* NOTREACHED */ +#if TCP_BUILTIN_CLIENT_PTHREAD return 0; +#endif } /* So we don't get "no handler for... 
" msgs */ @@ -333,7 +372,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { - u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234"; u8 *uri; tclient_main_t *tm = &tclient_main; int i; @@ -349,7 +388,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "iterations %d", &tm->n_iterations)) ; - else if (unformat (input, "bytes %d", &tm->bytes_to_send)) + else if (unformat (input, "bytes %lld", &tm->bytes_to_send)) ; else if (unformat (input, "uri %s", &tm->connect_uri)) ; @@ -366,17 +405,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, create_api_loopback (tm); +#if TCP_BUILTIN_CLIENT_PTHREAD /* Start a transmit thread */ if (tm->client_thread_handle == 0) { int rv = pthread_create (&tm->client_thread_handle, - NULL /*attr */ , tclient_thread_fn, 0); + NULL /*attr */ , + tclient_thread_fn, 0); if (rv) { tm->client_thread_handle = 0; return clib_error_return (0, "pthread_create returned %d", rv); } } +#endif /* Fire off connect requests, in something approaching a normal manner */ for (i = 0; i < n_clients; i++) @@ -397,6 +439,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, return 0; } +#if TCP_BUILTIN_CLIENT_VPP_THREAD +/* *INDENT-OFF* */ +VLIB_REGISTER_THREAD (builtin_client_reg, static) = { + .name = "tcp-builtin-client", + .function = tclient_thread_fn, + .fixed_count = 1, + .count = 1, + .no_data_structure_clone = 1, +}; +/* *INDENT-ON* */ +#endif + /* *INDENT-OFF* */ VLIB_CLI_COMMAND (test_clients_command, static) = { diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index efd26e91..917d4bd3 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -127,6 +127,7 @@ builtin_server_rx_callback (stream_session_t * s) { /* XXX timeout for session that are stuck */ + rx_event: /* Program self-tap to retry */ if (svm_fifo_set_event (rx_fifo)) { @@ -158,7 +159,9 @@ builtin_server_rx_callback 
(stream_session_t * s) n_written = svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf); - ASSERT (n_written == max_transfer); + + if (n_written != max_transfer) + clib_warning ("short trout!"); if (svm_fifo_set_event (tx_fifo)) { @@ -171,6 +174,9 @@ builtin_server_rx_callback (stream_session_t * s) (u8 *) & evt, 0 /* do wait for mutex */ ); } + if (PREDICT_FALSE (max_enqueue < max_dequeue)) + goto rx_event; + return 0; } @@ -204,8 +210,8 @@ server_create (vlib_main_t * vm) a->session_cb_vft = &builtin_session_cb_vft; a->options = options; a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; - a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; - a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16; a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index c3df5bc1..b2a371e2 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -578,7 +578,9 @@ tcp_session_send_space (transport_connection_t * trans_conn) /* If we can't write at least a segment, don't try at all */ if (snd_space < tc->snd_mss) return 0; - return snd_space; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); } /* If in fast recovery, send 1 SMSS if wnd allows */ @@ -706,7 +708,7 @@ static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = { tcp_timer_retransmit_handler, tcp_timer_delack_handler, - 0, + tcp_timer_persist_handler, tcp_timer_keep_handler, tcp_timer_waitclose_handler, tcp_timer_retransmit_syn_handler, diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index b4286bc4..2f5da108 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -81,6 +81,7 @@ typedef void (timer_expiration_handler) (u32 index); extern timer_expiration_handler tcp_timer_delack_handler; extern timer_expiration_handler tcp_timer_retransmit_handler; +extern 
timer_expiration_handler tcp_timer_persist_handler; extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_TIMER_HANDLE_INVALID ((u32) ~0) @@ -253,13 +254,25 @@ struct _tcp_cc_algorithm #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY +#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) -#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) +#define tcp_in_cong_recovery(tc) ((tc)->flags & \ + (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) + +always_inline void +tcp_cong_recovery_off (tcp_connection_t * tc) +{ + tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); + tcp_fastrecovery_1_smss_off (tc); +} + typedef enum { TCP_IP4, @@ -538,6 +551,27 @@ tcp_retransmit_timer_reset (tcp_connection_t * tc) tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); } +always_inline void +tcp_persist_timer_set (tcp_connection_t * tc) +{ + /* Reuse RTO. 
It's backed off in handler */ + tcp_timer_set (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_persist_timer_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_persist_timer_reset (tcp_connection_t * tc) +{ + tcp_timer_reset (tc, TCP_TIMER_PERSIST); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 5a71694e..0090e15e 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -31,6 +31,7 @@ _(UNBIND, "unbind") \ _(DELETE, "delete") \ _(SYN_SENT, "SYN sent") \ + _(SYN_RTX, "SYN retransmit") \ _(FIN_SENT, "FIN sent") \ _(ACK_SENT, "ACK sent") \ _(DUPACK_SENT, "DUPACK sent") \ @@ -50,6 +51,7 @@ _(CC_PACK, "cc partial ack") \ _(SEG_INVALID, "invalid segment") \ _(ACK_RCV_ERR, "invalid ack") \ + _(RCV_WND_SHRUNK, "shrunk rcv_wnd") \ typedef enum _tcp_dbg { @@ -159,35 +161,48 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u", \ - .format_args = "i4i4i4i4", \ + .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ ed->data[1] = _tc->rcv_nxt - _tc->irs; \ ed->data[2] = _tc->rcv_wnd; \ ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_wnd; \ } #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u", \ - .format_args = "i4i4i4i4", \ + .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->rcv_nxt - _tc->irs; \ ed->data[1] = _tc->rcv_wnd; \ ed->data[2] = _tc->snd_nxt - _tc->iss; \ ed->data[3] = tcp_available_wnd(_tc); \ + ed->data[4] = _tc->snd_wnd; \ } #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYNtx: iss %u", \ + .format = "SYNtx: iss %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->iss; \ +} + +#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYNrtx: iss %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -254,17 +269,17 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...) \ +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u", \ + .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->bytes_acked; \ ed->data[1] = _tc->snd_una - _tc->iss; \ - ed->data[2] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_wnd; \ ed->data[3] = _tc->cwnd; \ ed->data[4] = tcp_flight_size(_tc); \ } @@ -273,14 +288,15 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u", \ - .format_args = "i4i4i4i4", \ + .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->cwnd; \ ed->data[2] = _tc->snd_wnd; \ ed->data[3] = tcp_flight_size(_tc); \ + ed->data[4] = _tc->rcv_wnd; \ } #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ @@ -302,7 +318,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "in: %s len %u written %d rcv_nxt %u free wnd %d", \ + .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d", \ .format_args = "t4i4i4i4i4", \ .n_enum_strings = 2, \ .enum_strings = { \ @@ -338,7 +354,7 @@ typedef enum _tcp_dbg_evt .enum_strings = { \ "retransmit", \ "delack", \ - "BUG", \ + "persist", \ "keep", \ "waitclose", \ "retransmit syn", \ @@ -354,7 +370,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u", \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ @@ -445,6 +461,24 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) #endif +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) 
\ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} + #if TCP_DBG_VERBOSE #define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ { \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 5d11985f..a8224dc2 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -276,8 +276,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tc0->rcv_wnd == 0 && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) { - /* Make it look as if there's nothing to dequeue */ - vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number; + /* TODO Should segment be tagged? */ } else { @@ -375,7 +374,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; - tc->rtt_seq = 0; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances @@ -395,6 +393,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + /* Allow measuring of RTT and make sure boff is 0 */ + tc->rtt_seq = 0; + tc->rto_boff = 0; + return 1; } @@ -408,11 +410,7 @@ tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); /* Update rtt and rto */ - if (tcp_update_rtt (tc, ack)) - { - /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ - tc->rto_boff = 0; - } + tcp_update_rtt (tc, ack); } /** @@ -672,6 +670,13 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl1 = seq; tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); + + /* Set probe timer if we just got 0 wnd */ + if 
(tc->snd_wnd >= tc->snd_mss) + tcp_persist_timer_reset (tc); + /* Window still below one mss: arm the persist timer only if it is not already running */ + else if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + tcp_persist_timer_set (tc); } } @@ -686,6 +691,10 @@ tcp_cc_congestion (tcp_connection_t * tc) void tcp_cc_recover (tcp_connection_t * tc) { + /* TODO: check if time to recover was small. It might be that RTO popped + * too soon. + */ + tc->cc_algo->recovered (tc); tc->rtx_bytes = 0; @@ -695,8 +704,7 @@ tcp_cc_recover (tcp_connection_t * tc) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; - tcp_fastrecovery_1_smss_off (tc); - tcp_fastrecovery_off (tc); + tcp_cong_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -706,7 +714,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_recovery (tc)) + if (tcp_in_cong_recovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -724,10 +732,10 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* In case snd_nxt is still in the past and output tries to * shove some new bytes */ - tc->snd_nxt = tc->snd_una; + tc->snd_nxt = tc->snd_una_max; /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes) + if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) { tcp_fast_retransmit (tc); } @@ -735,9 +743,6 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { /* Retransmit first unacked segment */ tcp_retransmit_first_unacked (tc); - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; } } } @@ -814,10 +819,11 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, return -1; } - tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; - *error = TCP_ERROR_ACK_FUTURE; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, vnet_buffer (b)->tcp.ack_number); + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; } /* If old ACK, probably it's an old dupack */ @@ -863,7 +869,7 @@ tcp_rcv_ack 
(tcp_connection_t * tc, vlib_buffer_t * b, * timer. */ if (tc->bytes_acked) { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); /* Updates congestion control (slow start/congestion avoidance) */ tcp_cc_rcv_ack (tc, b); @@ -966,11 +972,14 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, tc->rcv_nxt += written; /* Depending on how fast the app is, all remaining buffers in burst will - * not be enqueued. Should we inform peer of the damage? XXX */ + * not be enqueued. Inform peer */ + tc->flags |= TCP_CONN_SNDACK; + return TCP_ERROR_PARTIALLY_ENQUEUED; } else { + tc->flags |= TCP_CONN_SNDACK; return TCP_ERROR_FIFO_FULL; } @@ -1101,25 +1110,17 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, goto done; } - if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL)) - *next0 = TCP_NEXT_DROP; - /* Check if ACK can be delayed */ - if (!tcp_can_delack (tc)) - { - /* Nothing to do for pure ACKs XXX */ - if (n_data_bytes == 0) - goto done; - - *next0 = tcp_next_output (tc->c_is_ip4); - tcp_make_ack (tc, b); - } - else + if (tcp_can_delack (tc)) { if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME); + goto done; } + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + done: return error; } @@ -2084,6 +2085,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->rcv_las = child0->rcv_nxt; child0->state = TCP_STATE_SYN_RCVD; /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a671f728..ea157bd7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -155,8 +155,7 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) max_fifo = stream_session_fifo_size (&tc->connection); ASSERT (tc->opt.mss < 
max_fifo); - - if (available_space < tc->opt.mss && available_space < max_fifo / 8) + if (available_space < tc->opt.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -170,16 +169,21 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - /* Does happen! */ wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); } else { wnd = available_space; } - if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) - wnd += 1 << tc->rcv_wscale; + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) + { + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; + } tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } @@ -462,8 +466,9 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; } /** @@ -908,6 +913,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + /* TODO this is updated in output as well ... */ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; @@ -929,7 +935,6 @@ tcp_send_ack (tcp_connection_t * tc) /* Fill in the ACK */ tcp_make_ack (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } @@ -942,7 +947,6 @@ tcp_timer_delack_handler (u32 index) tc = tcp_connection_get (index, thread_index); tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; -// tc->flags &= ~TCP_CONN_DELACK; tcp_send_ack (tc); } @@ -995,7 +999,7 @@ done: * Reset congestion control, switch cwnd to loss window and try again. 
*/ static void -tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +tcp_rtx_timeout_cc (tcp_connection_t * tc) { /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) @@ -1008,6 +1012,7 @@ tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) } /* Start again from the beginning */ + tcp_recovery_on (tc); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; } @@ -1048,7 +1053,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { /* First retransmit timeout */ if (tc->rto_boff == 1) - tcp_rtx_timeout_cc_recover (tc); + tcp_rtx_timeout_cc (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); @@ -1114,6 +1119,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); + TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); + /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1136,6 +1143,55 @@ tcp_timer_retransmit_syn_handler (u32 index) tcp_timer_retransmit_handler_i (index, 1); } +/** + * Got 0 snd_wnd from peer, try to do something about it. 
+ * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, n_bytes; + + tc = tcp_connection_get (index, thread_index); + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Try to force the first unsent segment */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), + tc->snd_una_max - tc->snd_una, + tc->snd_mss); + /* Nothing to send */ + if (n_bytes == 0) + { + tcp_return_buffer (tm); + return; + } + + b->current_length = n_bytes; + tcp_push_hdr_i (tc, b, tc->state); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable persist timer */ + tcp_persist_timer_set (tc); +} + /** * Retransmit first unacked segment */ @@ -1329,9 +1385,6 @@ tcp46_output_inline (vlib_main_t * vm, } } - /* Retransmitted SYNs do reach this but it should be harmless */ - tc0->rcv_las = tc0->rcv_nxt; - /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) -- cgit 1.2.3-korg From 586afd762bfa149f5ca167bd5fd5a0cd59ce94fe Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 19:18:20 +0200 Subject: Use thread local storage for thread index This patch deprecates stack-based thread identification. Also removes requirement that thread stacks are adjacent. Finally, possibly annoying for some folks, it renames all occurrences of cpu_index and cpu_number with thread index.
Using word "cpu" is misleading here as thread can be migrated to a different CPU, and also it is not related to Linux cpu index. Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1 Signed-off-by: Damjan Marion --- src/examples/srv6-sample-localsid/node.c | 4 +- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/device.c | 8 +- src/plugins/dpdk/device/dpdk_priv.h | 8 +- src/plugins/dpdk/device/init.c | 2 +- src/plugins/dpdk/device/node.c | 32 +++--- src/plugins/dpdk/hqos/hqos.c | 16 +-- src/plugins/dpdk/ipsec/cli.c | 8 +- src/plugins/dpdk/ipsec/crypto_node.c | 4 +- src/plugins/dpdk/ipsec/esp.h | 4 +- src/plugins/dpdk/ipsec/esp_decrypt.c | 4 +- src/plugins/dpdk/ipsec/esp_encrypt.c | 5 +- src/plugins/dpdk/ipsec/ipsec.c | 2 +- src/plugins/dpdk/ipsec/ipsec.h | 4 +- src/plugins/dpdk/main.c | 2 +- src/plugins/flowperpkt/l2_node.c | 2 +- src/plugins/flowperpkt/node.c | 2 +- src/plugins/ioam/export-common/ioam_export.h | 6 +- .../ioam/ip6/ioam_cache_tunnel_select_node.c | 16 +-- src/plugins/ixge/ixge.c | 2 +- src/plugins/lb/lb.c | 8 +- src/plugins/lb/node.c | 22 ++-- src/plugins/lb/refcount.c | 8 +- src/plugins/lb/refcount.h | 4 +- src/plugins/memif/node.c | 35 +++--- src/plugins/snat/in2out.c | 110 +++++++++--------- src/plugins/snat/out2in.c | 102 ++++++++--------- src/plugins/snat/snat.h | 10 +- src/vlib/buffer.c | 6 +- src/vlib/buffer_funcs.h | 4 +- src/vlib/cli.c | 6 +- src/vlib/counter.h | 16 +-- src/vlib/error.c | 2 +- src/vlib/global_funcs.h | 2 +- src/vlib/main.c | 14 +-- src/vlib/main.h | 2 +- src/vlib/node.c | 2 +- src/vlib/node.h | 6 +- src/vlib/node_funcs.h | 8 +- src/vlib/threads.c | 69 ++++------- src/vlib/threads.h | 21 ++-- src/vlib/unix/cj.c | 7 +- src/vlib/unix/cj.h | 2 +- src/vlib/unix/main.c | 43 +++---- src/vnet/adj/adj_l2.c | 4 +- src/vnet/adj/adj_midchain.c | 8 +- src/vnet/adj/adj_nsh.c | 4 +- src/vnet/classify/vnet_classify.c | 16 +-- src/vnet/cop/ip4_whitelist.c | 8 +- src/vnet/cop/ip6_whitelist.c | 8 +- src/vnet/devices/af_packet/node.c | 20
++-- src/vnet/devices/devices.c | 61 +++++----- src/vnet/devices/devices.h | 18 +-- src/vnet/devices/netmap/node.c | 24 ++-- src/vnet/devices/ssvm/node.c | 6 +- src/vnet/devices/virtio/vhost-user.c | 127 +++++++++++---------- src/vnet/dpo/lookup_dpo.c | 20 ++-- src/vnet/dpo/replicate_dpo.c | 12 +- src/vnet/ethernet/arp.c | 2 +- src/vnet/ethernet/interface.c | 7 +- src/vnet/ethernet/node.c | 14 +-- src/vnet/gre/node.c | 8 +- src/vnet/interface.h | 2 +- src/vnet/interface_output.c | 53 ++++----- src/vnet/ip/ip4_forward.c | 34 +++--- src/vnet/ip/ip4_input.c | 8 +- src/vnet/ip/ip6_forward.c | 24 ++-- src/vnet/ip/ip6_input.c | 8 +- src/vnet/ip/ip6_neighbor.c | 4 +- src/vnet/ipsec/esp.h | 8 +- src/vnet/ipsec/esp_decrypt.c | 13 ++- src/vnet/ipsec/esp_encrypt.c | 13 ++- src/vnet/ipsec/ikev2.c | 64 ++++++----- src/vnet/ipsec/ipsec.h | 12 +- src/vnet/ipsec/ipsec_if.c | 2 +- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/l2/l2_input.c | 14 +-- src/vnet/l2/l2_output.c | 6 +- src/vnet/l2tp/decap.c | 2 +- src/vnet/l2tp/encap.c | 2 +- src/vnet/l2tp/l2tp.c | 6 +- src/vnet/lisp-gpe/decap.c | 16 +-- src/vnet/lldp/lldp_input.c | 2 +- src/vnet/map/ip4_map.c | 14 +-- src/vnet/map/ip4_map_t.c | 12 +- src/vnet/map/ip6_map.c | 19 +-- src/vnet/map/ip6_map_t.c | 12 +- src/vnet/mpls/mpls_input.c | 8 +- src/vnet/mpls/mpls_lookup.c | 20 ++-- src/vnet/mpls/mpls_output.c | 10 +- src/vnet/pg/input.c | 4 +- src/vnet/replication.c | 20 ++-- src/vnet/replication.h | 2 +- src/vnet/session/node.c | 2 +- src/vnet/sr/sr_localsid.c | 44 +++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 8 +- src/vnet/tcp/tcp_debug.h | 2 +- src/vnet/tcp/tcp_input.c | 10 +- src/vnet/tcp/tcp_output.c | 20 ++-- src/vnet/udp/udp_input.c | 2 +- src/vnet/unix/tapcli.c | 2 +- src/vnet/unix/tuntap.c | 4 +- src/vnet/vxlan-gpe/decap.c | 10 +- src/vnet/vxlan-gpe/encap.c | 12 +- src/vnet/vxlan/decap.c | 10 +- src/vnet/vxlan/encap.c | 12 +- src/vpp/stats/stats.c | 14 +-- src/vpp/stats/stats.h | 2 +- 109 files changed, 790 
insertions(+), 791 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/examples/srv6-sample-localsid/node.c b/src/examples/srv6-sample-localsid/node.c index 7bae9cd7..e83e2352 100644 --- a/src/examples/srv6-sample-localsid/node.c +++ b/src/examples/srv6-sample-localsid/node.c @@ -114,7 +114,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -168,7 +168,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram /* This increments the SRv6 per LocalSID counters.*/ vlib_increment_combined_counter (((next0 == SRV6_SAMPLE_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : &(sm->sr_ls_valid_counters)), - cpu_index, + thread_index, ls0 - sm->localsids, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 2765c292..c80b3fa8 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -132,7 +132,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 50b26689..91661246 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -243,7 +243,7 @@ static_always_inline ASSERT (ring->tx_tail == 0); n_retry = 16; - queue_id = vm->cpu_index; + queue_id = vm->thread_index; do { @@ -266,7 +266,7 @@ static_always_inline { /* no wrap, transmit in one burst */ dpdk_device_hqos_per_worker_thread_t *hqos = - &xd->hqos_wt[vm->cpu_index]; + &xd->hqos_wt[vm->thread_index]; ASSERT (hqos->swq != NULL); @@ -332,7 +332,7 @@ 
dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) { dpdk_main_t *dm = &dpdk_main; - u32 my_cpu = vm->cpu_index; + u32 my_cpu = vm->thread_index; struct rte_mbuf *mb_new; if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) @@ -376,7 +376,7 @@ dpdk_interface_tx (vlib_main_t * vm, tx_ring_hdr_t *ring; u32 n_on_ring; - my_cpu = vm->cpu_index; + my_cpu = vm->thread_index; queue_id = my_cpu; diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h index dd40ff48..52b4ca4b 100644 --- a/src/plugins/dpdk/device/dpdk_priv.h +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -79,7 +79,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) { vlib_simple_counter_main_t *cm; vnet_main_t *vnm = vnet_get_main (); - u32 my_cpu = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u64 rxerrors, last_rxerrors; /* only update counters for PMD interfaces */ @@ -96,7 +96,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_NO_BUF); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.rx_nombuf - xd->last_stats.rx_nombuf); } @@ -107,7 +107,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_MISS); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.imissed - xd->last_stats.imissed); } @@ -119,7 +119,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_ERROR); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, rxerrors - last_rxerrors); } 
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 538db6cb..7eaf8da7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -324,7 +324,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) int rv; int j; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index e740fd18..b10e0fad 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -283,7 +283,7 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, */ static_always_inline u32 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, - vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id, + vlib_node_runtime_t * node, u32 thread_index, u16 queue_id, int maybe_multiseg) { u32 n_buffers; @@ -294,7 +294,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, uword n_rx_bytes = 0; u32 n_trace, trace_cnt __attribute__ ((unused)); vlib_buffer_free_list_t *fl; - vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index); + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) return 0; @@ -306,7 +306,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, return 0; } - vec_reset_length (xd->d_trace_buffers[cpu_index]); + vec_reset_length (xd->d_trace_buffers[thread_index]); trace_cnt = n_trace = vlib_get_trace_count (vm, node); if (n_trace > 0) @@ -318,7 +318,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, { struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); - vec_add1 (xd->d_trace_buffers[cpu_index], + vec_add1 (xd->d_trace_buffers[thread_index], vlib_get_buffer_index (vm, b)); } } @@ -546,20 +546,22 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, vlib_put_next_frame (vm, 
node, next_index, n_left_to_next); } - if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0)) + if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0)) { - dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index], - vec_len (xd->d_trace_buffers[cpu_index])); - vlib_set_trace_count (vm, node, n_trace - - vec_len (xd->d_trace_buffers[cpu_index])); + dpdk_rx_trace (dm, node, xd, queue_id, + xd->d_trace_buffers[thread_index], + vec_len (xd->d_trace_buffers[thread_index])); + vlib_set_trace_count (vm, node, + n_trace - + vec_len (xd->d_trace_buffers[thread_index])); } vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); + thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, mb_index); + vnet_device_increment_rx_packets (thread_index, mb_index); return mb_index; } @@ -630,19 +632,19 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) dpdk_device_t *xd; uword n_rx_packets = 0; dpdk_device_and_queue_t *dq; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* * Poll all devices on this cpu for input/interrupts. 
*/ /* *INDENT-OFF* */ - vec_foreach (dq, dm->devices_by_cpu[cpu_index]) + vec_foreach (dq, dm->devices_by_cpu[thread_index]) { xd = vec_elt_at_index(dm->devices, dq->device); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0); } /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index a288fca7..8b251beb 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -397,7 +397,7 @@ static_always_inline void dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -405,12 +405,12 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (dev_pos >= n_devs) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -479,7 +479,7 @@ static_always_inline void dpdk_hqos_thread_internal (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -487,7 +487,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len 
(dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (PREDICT_FALSE (n_devs == 0)) { dev_pos = 0; @@ -497,7 +497,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -586,7 +586,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) vm = vlib_get_main (); - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); @@ -595,7 +595,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) while (tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); - if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0) + if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0) return clib_error ("current I/O TX thread does not have any devices assigned to it"); diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index cd0a6037..3ae8c9b8 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -42,8 +42,8 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) for (i = 0; i < tm->n_vlib_mains; i++) { uword key, data; - u32 cpu_index = vlib_mains[i]->cpu_index; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_mains[i]->thread_index; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; u8 *s = 0; if (skip_master) @@ -57,7 +57,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) i32 last_cdev = -1; crypto_qp_data_t *qpd; - s = format (s, "%u\t", cpu_index); + s = format (s, "%u\t", thread_index); /* *INDENT-OFF* */ vec_foreach (qpd, cwm->qp_data) @@ -95,7 +95,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * 
vm, u16 detail_display) cap.sym.auth.algo = p_key->auth_algo; check_algo_is_supported (&cap, auth_str); vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", - vlib_mains[i]->cpu_index, cipher_str, auth_str, + vlib_mains[i]->thread_index, cipher_str, auth_str, p_key->is_outbound ? "out" : "in", cwm->qp_data[data].dev_id, cwm->qp_data[data].qp_id); diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c index dc3452b2..a3c45902 100644 --- a/src/plugins/dpdk/ipsec/crypto_node.c +++ b/src/plugins/dpdk/ipsec/crypto_node.c @@ -171,9 +171,9 @@ static uword dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; crypto_qp_data_t *qpd; u32 n_deq = 0; diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 320295b1..56f0c756 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -170,9 +170,9 @@ static_always_inline int create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, u8 is_outbound) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; struct rte_crypto_sym_xform cipher_xform = { 0 }; struct rte_crypto_sym_xform auth_xform = { 0 }; struct rte_crypto_sym_xform *xfs; diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c index 286e03f8..bab76e3b 100644 --- a/src/plugins/dpdk/ipsec/esp_decrypt.c +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -88,7 +88,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t 
*im = &ipsec_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); dpdk_crypto_main_t * dcm = &dpdk_crypto_main; dpdk_esp_main_t * em = &dpdk_esp_main; u32 i; @@ -104,7 +104,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, thread_index); u32 n_qps = vec_len(cwm->qp_data); struct rte_crypto_op ** cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], * bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c index 5b03de73..f996d7df 100644 --- a/src/plugins/dpdk/ipsec/esp_encrypt.c +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -93,7 +93,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; dpdk_esp_main_t *em = &dpdk_esp_main; u32 i; @@ -111,7 +111,8 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = + vec_elt_at_index (dcm->workers_main, thread_index); u32 n_qps = vec_len (cwm->qp_data); struct rte_crypto_op **cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], *bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index b0aaaaec..5d8f4fba 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -289,7 +289,7 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, if (!map) { clib_warning ("unable to create hash table for worker %u", - vlib_mains[i]->cpu_index); + vlib_mains[i]->thread_index); goto error; } cwm->algo_qp_map = map; diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index 28bffc80..f0f793c0 
100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -95,8 +95,8 @@ static_always_inline void crypto_alloc_cops () { dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - u32 cpu_index = os_get_cpu_number (); - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_get_thread_index (); + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; unsigned socket_id = rte_socket_id (); crypto_qp_data_t *qpd; diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 7ee2a785..942b8b2d 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -39,7 +39,7 @@ rte_delay_us_override (unsigned us) * thread then do not intercept. (Must not be called from an * independent pthread). */ - if (os_get_cpu_number () == 0) + if (vlib_get_thread_index () == 0) { /* * We're in the vlib main thread or a vlib process. Make sure diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c index 1c2f681e..fdaf81d1 100644 --- a/src/plugins/flowperpkt/l2_node.c +++ b/src/plugins/flowperpkt/l2_node.c @@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, u8 * src_mac, u8 * dst_mac, u16 ethertype, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c index f77f087d..0277682d 100644 --- a/src/plugins/flowperpkt/node.c +++ b/src/plugins/flowperpkt/node.c @@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, u32 src_address, u32 dst_address, u8 tos, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index 
2bf3fd54..9de0d13b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -477,8 +477,8 @@ do { \ from = vlib_frame_vector_args (F); \ n_left_from = (F)->n_vectors; \ next_index = (N)->cached_next_index; \ - while (__sync_lock_test_and_set ((EM)->lockp[(VM)->cpu_index], 1)); \ - my_buf = ioam_export_get_my_buffer (EM, (VM)->cpu_index); \ + while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1)); \ + my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index); \ my_buf->touched_at = vlib_time_now (VM); \ while (n_left_from > 0) \ { \ @@ -620,7 +620,7 @@ do { \ } \ vlib_node_increment_counter (VM, export_node.index, \ EXPORT_ERROR_RECORDED, pkts_recorded); \ - *(EM)->lockp[(VM)->cpu_index] = 0; \ + *(EM)->lockp[(VM)->thread_index] = 0; \ } while(0) #endif /* __included_ioam_export_h__ */ diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c index a56dc040..0cf742c9 100644 --- a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c +++ b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c @@ -396,7 +396,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -419,7 +419,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -455,7 +455,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp1->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index1)) + vm->thread_index, &pool_index1)) { cache_ts_added++; } @@ -479,7 +479,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) 
((u8 *) hbh1 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index1; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -562,7 +562,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -585,7 +585,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -701,7 +701,7 @@ expired_cache_ts_timer_callback (u32 * expired_timers) ioam_cache_main_t *cm = &ioam_cache_main; int i; u32 pool_index; - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 count = 0; for (i = 0; i < vec_len (expired_timers); i++) @@ -724,7 +724,7 @@ ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm, vlib_frame_t * f) { ioam_cache_main_t *cm = &ioam_cache_main; - u32 my_thread_index = os_get_cpu_number (); + u32 my_thread_index = vlib_get_thread_index (); struct timespec ts, tsrem; tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index], diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index f3c5cc09..08f5b692 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -1887,7 +1887,7 @@ done: vlib_increment_combined_counter (vnet_main. 
interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , + 0 /* thread_index */ , xd->vlib_sw_if_index, n_packets, dq->rx.n_bytes); diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index add81236..addc2a42 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -63,11 +63,11 @@ u8 *format_lb_main (u8 * s, va_list * args) s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); - u32 cpu_index; - for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) { - lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht; + u32 thread_index; + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; if (h) { - s = format(s, "core %d\n", cpu_index); + s = format(s, "core %d\n", thread_index); s = format(s, " timeout: %ds\n", h->timeout); s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); } diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 8b763c53..3171148b 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -60,10 +60,10 @@ format_lb_trace (u8 * s, va_list * args) return s; } -lb_hash_t *lb_get_sticky_table(u32 cpu_index) +lb_hash_t *lb_get_sticky_table(u32 thread_index) { lb_main_t *lbm = &lb_main; - lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; //Check if size changed if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) { @@ -71,8 +71,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) lb_hash_bucket_t *b; u32 i; lb_hash_foreach_entry(sticky_ht, b, i) { - vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1); + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); } 
lb_hash_free(sticky_ht); @@ -81,8 +81,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) //Create if necessary if (PREDICT_FALSE(sticky_ht == NULL)) { - lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); - sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; clib_warning("Regenerated sticky table %p", sticky_ht); } @@ -153,10 +153,10 @@ lb_node_fn (vlib_main_t * vm, { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 lb_time = lb_hash_time_now(vm); - lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -240,9 +240,9 @@ lb_node_fn (vlib_main_t * vm, //Configuration may be changed, vectors resized, etc... 
//Dereference previously used - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, lb_hash_available_value(sticky_ht, hash0, available_index0), -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, asindex0, 1); //Add sticky entry @@ -260,7 +260,7 @@ lb_node_fn (vlib_main_t * vm, } vlib_increment_simple_counter(&lbm->vip_counters[counter], - cpu_index, + thread_index, vnet_buffer (p0)->ip.adj_index[VLIB_TX], 1); diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c index 22415c88..6f01ab5a 100644 --- a/src/plugins/lb/refcount.c +++ b/src/plugins/lb/refcount.c @@ -31,10 +31,10 @@ u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) { u64 count = 0; vlib_thread_main_t *tm = vlib_get_thread_main (); - u32 cpu_index; - for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) { - if (r->per_cpu[cpu_index].length > index) - count += r->per_cpu[cpu_index].counters[index]; + u32 thread_index; + for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) { + if (r->per_cpu[thread_index].length > index) + count += r->per_cpu[thread_index].counters[index]; } return count; } diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h index 8c26e7be..dcfcb3fe 100644 --- a/src/plugins/lb/refcount.h +++ b/src/plugins/lb/refcount.h @@ -45,9 +45,9 @@ typedef struct { void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); static_always_inline -void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v) +void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v) { - vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index]; + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index]; if (PREDICT_FALSE(counter_index >= per_cpu->length)) __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); diff --git a/src/plugins/memif/node.c 
b/src/plugins/memif/node.c index 659d5dfb..cee1f3d1 100644 --- a/src/plugins/memif/node.c +++ b/src/plugins/memif/node.c @@ -94,7 +94,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_rx_bytes = 0; u32 *to_next = 0; u32 n_free_bufs; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 bi0, bi1; vlib_buffer_t *b0, *b1; u16 ring_size = 1 << mif->log2_ring_size; @@ -105,14 +105,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->per_interface_next_index != ~0) next_index = mif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < ring_size)) { - vec_validate (nm->rx_buffers[cpu_index], ring_size + n_free_bufs - 1); + vec_validate (nm->rx_buffers[thread_index], + ring_size + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], ring_size); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } head = ring->head; @@ -158,15 +159,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, CLIB_CACHE_LINE_BYTES, LOAD); } /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - bi1 = nm->rx_buffers[cpu_index][last_buf - 1]; - _vec_len (nm->rx_buffers[cpu_index]) -= 2; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + bi1 = nm->rx_buffers[thread_index][last_buf - 1]; + _vec_len (nm->rx_buffers[thread_index]) -= 2; if (last_buf > 4) { - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 2]); - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 3]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + 
memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); } /* enqueue buffer */ @@ -256,9 +257,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (num_slots && n_left_to_next) { /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - _vec_len (nm->rx_buffers[cpu_index]) = last_buf; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + _vec_len (nm->rx_buffers[thread_index]) = last_buf; /* enqueue buffer */ to_next[0] = bi0; @@ -315,7 +316,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, ring->tail = head; vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, mif->hw_if_index, n_rx_packets, n_rx_bytes); @@ -327,7 +328,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); memif_main_t *nm = &memif_main; memif_if_t *mif; @@ -337,7 +338,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->flags & MEMIF_IF_FLAG_ADMIN_UP && mif->flags & MEMIF_IF_FLAG_CONNECTED && (mif->if_index % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) { if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) n_rx_packets += diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index b4961365..e5ee965f 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -212,7 +212,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, snat_session_t ** sessionp, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -246,27 +246,27 @@ static u32 slow_path (snat_main_t *sm, 
vlib_buffer_t *b0, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = ip0->src_address; u->fib_index = rx_fib_index0; - pool_get (sm->per_thread_data[cpu_index].list_pool, per_user_list_head_elt); + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } @@ -276,25 +276,25 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Remove the oldest dynamic translation */ do { oldest_per_user_translation_list_index = - clib_dlist_remove_head (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); ASSERT (oldest_per_user_translation_list_index != ~0); /* add it back to the end of the LRU list */ - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index, oldest_per_user_translation_list_index); /* Get the list element */ oldest_per_user_translation_list_elt = - pool_elt_at_index (sm->per_thread_data[cpu_index].list_pool, + pool_elt_at_index (sm->per_thread_data[thread_index].list_pool, 
oldest_per_user_translation_list_index); /* Get the session index from the list element */ session_index = oldest_per_user_translation_list_elt->value; /* Get the session */ - s = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, session_index); } while (snat_is_session_static (s)); @@ -346,7 +346,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create a new session */ - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = address_index; @@ -362,22 +362,22 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); } s->in2out = *key0; @@ -388,12 +388,12 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - 
sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -403,7 +403,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, worker_by_out_key.port = s->out2in.port; worker_by_out_key.fib_index = s->out2in.fib_index; kv0.key = worker_by_out_key.as_u64; - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_out, &kv0, 1); /* log NAT event */ @@ -465,7 +465,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -473,7 +473,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * @param d optional parameter */ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -524,13 +524,13 @@ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, } next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto out; } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -548,7 
+548,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -556,7 +556,7 @@ out: * @param d optional parameter */ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -624,7 +624,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -641,7 +641,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_in2out_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -847,11 +847,11 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0)) { @@ -862,9 +862,9 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translations */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail 
(sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -884,7 +884,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, snat_runtime_t * rt = (snat_runtime_t *)node->runtime_data; f64 now = vlib_time_now (vm); u32 stats_node_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); stats_node_index = is_slow_path ? snat_in2out_slowpath_node.index : snat_in2out_node.index; @@ -977,7 +977,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, - node, next0, now, cpu_index, &s0); + node, next0, now, thread_index, &s0); goto trace00; } } @@ -1006,7 +1006,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace00; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace00; } @@ -1017,7 +1017,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1063,9 +1063,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1081,7 +1081,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } 
pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -1117,7 +1117,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next1 = icmp_in2out_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace01; } } @@ -1146,7 +1146,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace01; next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1, - &s1, node, next1, cpu_index); + &s1, node, next1, thread_index); if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP)) goto trace01; } @@ -1157,7 +1157,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->src_address.as_u32; @@ -1203,9 +1203,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -1220,7 +1220,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; @@ -1292,7 +1292,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } } @@ -1321,7 +1321,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace0; next0 = slow_path (sm, b0, ip0, 
rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace0; @@ -1333,7 +1333,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1379,9 +1379,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1397,7 +1397,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -2010,7 +2010,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -2048,7 +2048,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 656e42db..5d308d78 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -129,7 +129,7 @@ 
create_session_for_static_mapping (snat_main_t *sm, snat_session_key_t in2out, snat_session_key_t out2in, vlib_node_runtime_t * node, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -146,36 +146,36 @@ create_session_for_static_mapping (snat_main_t *sm, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = in2out.addr; u->fib_index = in2out.fib_index; - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); /* add non-traslated packets worker lookup */ - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_in, &kv0, 1); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = ~0; @@ -183,22 +183,22 @@ create_session_for_static_mapping (snat_main_t *sm, u->nstaticsessions++; /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init 
(sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = - per_user_translation_list_elt - sm->per_thread_data[cpu_index].list_pool; + per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); s->in2out = in2out; s->out2in = out2in; @@ -206,12 +206,12 @@ create_session_for_static_mapping (snat_main_t *sm, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -298,7 +298,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -306,7 +306,7 
@@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * @param d optional parameter */ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -366,7 +366,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, - node, cpu_index); + node, thread_index); if (!s0) { @@ -375,7 +375,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -393,7 +393,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -401,7 +401,7 @@ out: * @param d optional parameter */ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -460,7 +460,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -477,7 +477,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_out2in_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if 
(next0_tmp != ~0) next0 = next0_tmp; @@ -589,11 +589,11 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0)) { @@ -604,9 +604,9 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -624,7 +624,7 @@ snat_out2in_node_fn (vlib_main_t * vm, u32 pkts_processed = 0; snat_main_t * sm = &snat_main; f64 now = vlib_time_now (vm); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -712,7 +712,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } @@ -743,7 +743,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -752,7 +752,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, 
value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -796,9 +796,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -813,7 +813,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -847,7 +847,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next1 = icmp_out2in_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace1; } @@ -878,7 +878,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node, - cpu_index); + thread_index); if (!s1) { b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -887,7 +887,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->dst_address.as_u32; @@ -931,9 +931,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + 
clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -948,7 +948,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; @@ -1016,7 +1016,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace00; } @@ -1048,7 +1048,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -1057,7 +1057,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -1101,9 +1101,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1118,7 +1118,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ 
-1599,7 +1599,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -1637,7 +1637,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h index 017825c0..f4e1c5c0 100644 --- a/src/plugins/snat/snat.h +++ b/src/plugins/snat/snat.h @@ -221,7 +221,7 @@ struct snat_main_s; typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm, vlib_node_runtime_t *node, - u32 cpu_index, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, @@ -402,22 +402,22 @@ typedef struct { } tcp_udp_header_t; u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); diff --git a/src/vlib/buffer.c 
b/src/vlib/buffer.c index a517a597..be3b41ef 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -299,7 +299,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* smp disaster check */ if (vec_len (vlib_mains) > 1) @@ -355,7 +355,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, vlib_buffer_free_list_t *f; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) { @@ -474,7 +474,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 394c336a..328660a3 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -209,7 +209,7 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); uword *p = hash_get (bm->buffer_known_hash, buffer_index); return p ? 
p[0] : VLIB_BUFFER_UNKNOWN; @@ -221,7 +221,7 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); hash_set (bm->buffer_known_hash, buffer_index, state); } diff --git a/src/vlib/cli.c b/src/vlib/cli.c index f853f655..3cc95076 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -709,7 +709,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags |= MHEAP_FLAG_VALIDATE; // Turn off small object cache because it delays detection of errors @@ -722,7 +722,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags &= ~MHEAP_FLAG_VALIDATE; mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; @@ -733,7 +733,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap_validate(heap); }); diff --git a/src/vlib/counter.h b/src/vlib/counter.h index 17a85217..60e2055d 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -70,17 +70,17 @@ u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm); /** Increment a simple counter @param cm - (vlib_simple_counter_main_t *) simple counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param increment - (u64) quantitiy to add to the 
counter */ always_inline void vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, - u32 cpu_index, u32 index, u64 increment) + u32 thread_index, u32 index, u64 increment) { counter_t *my_counters; - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index] += increment; } @@ -201,7 +201,7 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); /** Increment a combined counter @param cm - (vlib_combined_counter_main_t *) comined counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param packet_increment - (u64) number of packets to add to the counter @param byte_increment - (u64) number of bytes to add to the counter @@ -209,13 +209,13 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); always_inline void vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, - u32 cpu_index, + u32 thread_index, u32 index, u64 n_packets, u64 n_bytes) { vlib_counter_t *my_counters; /* Use this CPU's counter array */ - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index].packets += n_packets; my_counters[index].bytes += n_bytes; @@ -224,14 +224,14 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, /** Pre-fetch a per-thread combined counter for the given object index */ always_inline void vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm, - u32 cpu_index, u32 index) + u32 thread_index, u32 index) { vlib_counter_t *cpu_counters; /* * This CPU's index is assumed to already be in cache */ - cpu_counters = cm->counters[cpu_index]; + cpu_counters = cm->counters[thread_index]; CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE); } diff --git a/src/vlib/error.c b/src/vlib/error.c index a2c23176..e4ed4ee3 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c 
@@ -149,7 +149,7 @@ vlib_register_errors (vlib_main_t * vm, vlib_node_t *n = vlib_get_node (vm, node_index); uword l; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* Free up any previous error strings. */ if (n->n_errors > 0) diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index f51ec381..9dd01fbf 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains[os_get_cpu_number ()]; + vm = vlib_mains[vlib_get_thread_index ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/main.c b/src/vlib/main.c index b22203f0..422d3e26 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -136,18 +136,18 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->cpu_index = vm->cpu_index; + f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) { - u32 save_cpu_index = f->cpu_index; + u32 save_thread_index = f->thread_index; memset (f, 0xfe, n); - f->cpu_index = save_cpu_index; + f->thread_index = save_thread_index; } /* Insert magic number. */ @@ -517,7 +517,7 @@ vlib_put_next_frame (vlib_main_t * vm, * a dangling frame reference. Each thread has its own copy of * the next_frames vector. */ - if (0 && r->cpu_index != next_runtime->cpu_index) + if (0 && r->thread_index != next_runtime->thread_index) { nf->frame_index = ~0; nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); @@ -866,7 +866,7 @@ vlib_elog_main_loop_event (vlib_main_t * vm, : evm->node_call_elog_event_types, node_index), /* track */ - (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. + (vm->thread_index ? &vlib_worker_threads[vm->thread_index]. 
elog_track : &em->default_track), /* data to log */ n_vectors); } @@ -963,7 +963,7 @@ dispatch_node (vlib_main_t * vm, vm->cpu_time_last_node_dispatch = last_time_stamp; - if (1 /* || vm->cpu_index == node->cpu_index */ ) + if (1 /* || vm->thread_index == node->thread_index */ ) { vlib_main_t *stat_vm; @@ -1029,7 +1029,7 @@ dispatch_node (vlib_main_t * vm, { u32 node_name, vector_length, is_polling; } *ed; - vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; + vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index; #endif if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT diff --git a/src/vlib/main.h b/src/vlib/main.h index 0197b4f3..329bf073 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -156,7 +156,7 @@ typedef struct vlib_main_t uword *init_functions_called; /* to compare with node runtime */ - u32 cpu_index; + u32 thread_index; void **mbuf_alloc_list; diff --git a/src/vlib/node.c b/src/vlib/node.c index dc0a4de5..bbd3a42e 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -99,7 +99,7 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) vlib_pending_frame_t *pf; i32 i, j, n_insert; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node.h b/src/vlib/node.h index fc7e7da2..1e2f4c38 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -344,8 +344,8 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner cpuid / heap id */ - u16 cpu_index; + /* Owner thread / heap id */ + u16 thread_index; /* Scalar and vector arguments to next node. */ u8 arguments[0]; @@ -459,7 +459,7 @@ typedef struct vlib_node_runtime_t zero before first run of this node. */ - u16 cpu_index; /**< CPU this node runs on */ + u16 thread_index; /**< thread this node runs on */ u8 runtime_data[0]; /**< Function dependent node-runtime data. 
This data is diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 1f7d94e1..54e36874 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,9 +201,9 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 thread_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; f = vm->heap_base + offset; return f; } @@ -215,10 +215,10 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains[f->cpu_index]; + vm = vlib_mains[f->thread_index]; i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->cpu_index; + return i | f->thread_index; } always_inline vlib_frame_t * diff --git a/src/vlib/threads.c b/src/vlib/threads.c index ef3a24d3..4a111f8d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,27 +35,12 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; +__thread uword vlib_thread_index = 0; + uword os_get_cpu_number (void) { - void *sp; - uword n; - u32 len; - - len = vec_len (vlib_thread_stacks); - if (len == 0) - return 0; - - /* Get any old stack address. */ - sp = &sp; - - n = ((uword) sp - (uword) vlib_thread_stacks[0]) - >> VLIB_LOG2_THREAD_STACK_SIZE; - - /* "processes" have their own stacks, and they always run in thread 0 */ - n = n >= len ? 0 : n; - - return n; + return vlib_thread_index; } uword @@ -275,21 +260,6 @@ vlib_thread_init (vlib_main_t * vm) return 0; } -vlib_worker_thread_t * -vlib_alloc_thread (vlib_main_t * vm) -{ - vlib_worker_thread_t *w; - - if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) - { - clib_warning ("out of worker threads... 
Quitting..."); - exit (1); - } - vec_add2 (vlib_worker_threads, w, 1); - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; - return w; -} - vlib_frame_queue_t * vlib_frame_queue_alloc (int nelts) { @@ -427,7 +397,7 @@ vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, f64 b4 = vlib_time_now_ticks (vm, before); vlib_worker_thread_barrier_check (vm, b4); /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ - // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + // vlib_frame_queue_dequeue (vm->thread_index, vm, nm); } elt = fq->elts + (new_tail & (fq->nelts - 1)); @@ -497,6 +467,8 @@ vlib_worker_thread_bootstrap_fn (void *arg) w->lwp = syscall (SYS_gettid); w->thread_id = pthread_self (); + vlib_thread_index = w - vlib_worker_threads; + rv = (void *) clib_calljmp ((uword (*)(uword)) w->thread_function, (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); @@ -610,7 +582,9 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = k; @@ -630,7 +604,7 @@ start_workers (vlib_main_t * vm) vm_clone = clib_mem_alloc (sizeof (*vm_clone)); clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); - vm_clone->cpu_index = worker_thread_index; + vm_clone->thread_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; vm_clone->mbuf_alloc_list = 0; vm_clone->init_functions_called = @@ -679,7 +653,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, 
n->runtime_data, @@ -692,7 +666,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -756,7 +730,8 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = j; @@ -827,7 +802,7 @@ vlib_worker_thread_node_runtime_update (void) uword n_calls, uword n_vectors, uword n_clocks); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (vec_len (vlib_mains) == 1) return; @@ -835,7 +810,7 @@ vlib_worker_thread_node_runtime_update (void) vm = vlib_mains[0]; nm = &vm->node_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); ASSERT (*vlib_worker_threads->wait_at_barrier == 1); /* @@ -955,7 +930,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -981,7 +956,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later 
for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -1180,7 +1155,7 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) if (vlib_mains == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); switch (which) @@ -1212,7 +1187,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) vlib_worker_threads[0].barrier_sync_count++; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; @@ -1260,7 +1235,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) int vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm) { - u32 thread_id = vm->cpu_index; + u32 thread_id = vm->thread_index; vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; vlib_frame_queue_elt_t *elt; u32 *from, *to; @@ -1393,7 +1368,7 @@ vlib_worker_thread_fn (void *arg) vlib_main_t *vm = vlib_get_main (); clib_error_t *e; - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); vlib_worker_thread_init (w); clib_time_init (&vm->clib_time); diff --git a/src/vlib/threads.h b/src/vlib/threads.h index eca4fc26..101d3d4a 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -153,8 +153,6 @@ typedef struct /* Called early, in thread 0's context */ clib_error_t *vlib_thread_init (vlib_main_t * vm); -vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm); - int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, u32 frame_queue_index, vlib_frame_t * frame, vlib_frame_queue_msg_type_t type); @@ -183,12 +181,19 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); +extern __thread uword vlib_thread_index; +static_always_inline uword +vlib_get_thread_index 
(void) +{ + return vlib_thread_index; +} + always_inline void vlib_smp_unsafe_warning (void) { if (CLIB_DEBUG > 0) { - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); } } @@ -331,21 +336,21 @@ vlib_num_workers () } always_inline u32 -vlib_get_worker_cpu_index (u32 worker_index) +vlib_get_worker_thread_index (u32 worker_index) { return worker_index + 1; } always_inline u32 -vlib_get_worker_index (u32 cpu_index) +vlib_get_worker_index (u32 thread_index) { - return cpu_index - 1; + return thread_index - 1; } always_inline u32 vlib_get_current_worker_index () { - return os_get_cpu_number () - 1; + return vlib_get_thread_index () - 1; } static inline void @@ -467,6 +472,8 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +u8 *vlib_thread_stack_init (uword thread_index); + int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb); diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c index 33ba163a..7c1e9475 100644 --- a/src/vlib/unix/cj.c +++ b/src/vlib/unix/cj.c @@ -48,7 +48,7 @@ cj_log (u32 type, void *data0, void *data1) r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); r->time = vlib_time_now (cjm->vlib_main); - r->cpu = os_get_cpu_number (); + r->thread_index = vlib_get_thread_index (); r->type = type; r->data[0] = pointer_to_uword (data0); r->data[1] = pointer_to_uword (data1); @@ -133,7 +133,8 @@ static inline void cj_dump_one_record (cj_record_t * r) { fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", - r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + r->thread_index, r->time, r->type, + (long long unsigned int) r->data[0], (long long unsigned int) r->data[1]); } @@ -161,7 +162,7 @@ cj_dump_internal (u8 filter0_enable, u64 filter0, index = (cjm->tail + 1) & (cjm->num_records - 1); r = &(cjm->records[index]); - if (r->cpu != (u32) ~ 0) + if (r->thread_index != (u32) ~ 0) { /* Yes, dump from tail + 1 to 
the end */ for (i = index; i < cjm->num_records; i++) diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h index 67626afe..d0a1d46e 100644 --- a/src/vlib/unix/cj.h +++ b/src/vlib/unix/cj.h @@ -23,7 +23,7 @@ typedef struct { f64 time; - u32 cpu; + u32 thread_index; u32 type; u64 data[2]; } cj_record_t; diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 6b96cc0d..db5ddd64 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -510,13 +510,28 @@ thread0 (uword arg) return i; } +u8 * +vlib_thread_stack_init (uword thread_index) +{ + vec_validate (vlib_thread_stacks, thread_index); + vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned + (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE); + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (vlib_thread_stacks[thread_index], + clib_mem_get_page_size (), PROT_READ) < 0) + clib_unix_warning ("thread stack"); + return vlib_thread_stacks[thread_index]; +} + int vlib_unix_main (int argc, char *argv[]) { vlib_main_t *vm = &vlib_global_main; /* one and only time for this! */ - vlib_thread_main_t *tm = &vlib_thread_main; unformat_input_t input; - u8 *thread_stacks; clib_error_t *e; int i; @@ -548,29 +563,9 @@ vlib_unix_main (int argc, char *argv[]) } unformat_free (&input); - /* - * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a - * VLIB_THREAD_STACK_SIZE boundary - * See also: os_get_cpu_number() in vlib/vlib/threads.c - */ - thread_stacks = clib_mem_alloc_aligned - ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, - VLIB_THREAD_STACK_SIZE); - - vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); - for (i = 0; i < vec_len (vlib_thread_stacks); i++) - { - vlib_thread_stacks[i] = thread_stacks; - - /* - * Disallow writes to the bottom page of the stack, to - * catch stack overflows. 
- */ - if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0) - clib_unix_warning ("thread stack"); + vlib_thread_stack_init (0); - thread_stacks += VLIB_THREAD_STACK_SIZE; - } + vlib_thread_index = 0; i = clib_calljmp (thread0, (uword) vm, (void *) (vlib_thread_stacks[0] + diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c index f68e54e0..20d70dd4 100644 --- a/src/vnet/adj/adj_l2.c +++ b/src/vnet/adj/adj_l2.c @@ -52,7 +52,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); ethernet_main_t * em = &ethernet_main; n_left_from = frame->n_vectors; @@ -93,7 +93,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->sw_if_index[VLIB_TX] = adj0->rewrite_header.sw_if_index; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index e8087f08..5756de43 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -49,7 +49,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, u32 next_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; /* Vector of buffer / pkt indices we're supposed to process */ from = vlib_frame_vector_args (frame); @@ -124,13 +124,13 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj1->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, 
b1)); @@ -181,7 +181,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/vnet/adj/adj_nsh.c b/src/vnet/adj/adj_nsh.c index 9a0f9d8b..128570b0 100644 --- a/src/vnet/adj/adj_nsh.c +++ b/src/vnet/adj/adj_nsh.c @@ -53,7 +53,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -94,7 +94,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->ip.save_rewrite_length = rw_len0; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 98842a48..70a189b0 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -251,12 +251,12 @@ static inline void make_working_copy vnet_classify_entry_##size##_t * working_copy##size = 0; foreach_size_in_u32x4; #undef _ - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); - if (cpu_number >= vec_len (t->working_copies)) + if (thread_index >= vec_len (t->working_copies)) { oldheap = clib_mem_set_heap (t->mheap); - vec_validate (t->working_copies, cpu_number); + vec_validate (t->working_copies, thread_index); clib_mem_set_heap (oldheap); } @@ -265,7 +265,7 @@ static inline void make_working_copy * updates from multiple threads will not result in sporadic, spurious * lookup failures. 
*/ - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; t->saved_bucket.as_u64 = b->as_u64; oldheap = clib_mem_set_heap (t->mheap); @@ -290,7 +290,7 @@ static inline void make_working_copy default: abort(); } - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } _vec_len(working_copy) = (1<<b->log2_pages)*t->entries_per_page; @@ -318,7 +318,7 @@ static inline void make_working_copy working_bucket.offset = vnet_classify_get_offset (t, working_copy); CLIB_MEMORY_BARRIER(); b->as_u64 = working_bucket.as_u64; - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } static vnet_classify_entry_t * @@ -387,7 +387,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, int i; u64 hash, new_hash; u32 new_log2_pages; - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u8 * key_minus_skip; ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0); @@ -498,7 +498,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, new_log2_pages = t->saved_bucket.log2_pages + 1; expand_again: - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; new_v = split_and_rehash (t, working_copy, new_log2_pages); if (new_v == 0) diff --git a/src/vnet/cop/ip4_whitelist.c b/src/vnet/cop/ip4_whitelist.c index 6ef3d7d7..1b5e336b 100644 --- a/src/vnet/cop/ip4_whitelist.c +++ b/src/vnet/cop/ip4_whitelist.c @@ -60,7 +60,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, cop_feature_type_t next_index; cop_main_t *cm = &cop_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -177,12 +177,12 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, 
lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -273,7 +273,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/cop/ip6_whitelist.c b/src/vnet/cop/ip6_whitelist.c index c2e16ccf..f3fe62e3 100644 --- a/src/vnet/cop/ip6_whitelist.c +++ b/src/vnet/cop/ip6_whitelist.c @@ -61,7 +61,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, cop_main_t *cm = &cop_main; ip6_main_t * im6 = &ip6_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -153,12 +153,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -233,7 +233,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index ba337f3f..76980102 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -124,7 
+124,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 frame_num = apif->rx_req->tp_frame_nr; u8 *block_start = apif->rx_ring + block * block_size; uword n_trace = vlib_get_trace_count (vm, node); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes; @@ -132,15 +132,15 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (apif->per_interface_next_index != ~0) next_index = apif->per_interface_next_index; - n_free_bufs = vec_len (apm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (apm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (apm->rx_buffers[cpu_index], + vec_validate (apm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &apm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (apm->rx_buffers[thread_index]) = n_free_bufs; } rx_frame = apif->next_rx_frame; @@ -163,11 +163,11 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { /* grab free buffer */ u32 last_empty_buffer = - vec_len (apm->rx_buffers[cpu_index]) - 1; + vec_len (apm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = apm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = apm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (apm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -236,9 +236,9 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters 
+ VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 41645220..5e5e812c 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -104,7 +104,7 @@ vnet_device_queue_sort (void *a1, void *a2) void vnet_device_input_assign_thread (u32 hw_if_index, - u16 queue_id, uword cpu_index) + u16 queue_id, uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; @@ -115,19 +115,19 @@ vnet_device_input_assign_thread (u32 hw_if_index, ASSERT (hw->input_node_index > 0); - if (vdm->first_worker_cpu_index == 0) - cpu_index = 0; + if (vdm->first_worker_thread_index == 0) + thread_index = 0; - if (cpu_index != 0 && - (cpu_index < vdm->first_worker_cpu_index || - cpu_index > vdm->last_worker_cpu_index)) + if (thread_index != 0 && + (thread_index < vdm->first_worker_thread_index || + thread_index > vdm->last_worker_thread_index)) { - cpu_index = vdm->next_worker_cpu_index++; - if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index) - vdm->next_worker_cpu_index = vdm->first_worker_cpu_index; + thread_index = vdm->next_worker_thread_index++; + if (vdm->next_worker_thread_index > vdm->last_worker_thread_index) + vdm->next_worker_thread_index = vdm->first_worker_thread_index; } - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; rt = vlib_node_get_runtime_data (vm, hw->input_node_index); vec_add2 (rt->devices_and_queues, dq, 1); @@ -136,33 +136,33 @@ vnet_device_input_assign_thread (u32 hw_if_index, dq->queue_id = queue_id; vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); - vec_validate (hw->input_node_cpu_index_by_queue, queue_id); - 
hw->input_node_cpu_index_by_queue[queue_id] = cpu_index; + vec_validate (hw->input_node_thread_index_by_queue, queue_id); + hw->input_node_thread_index_by_queue[queue_id] = thread_index; } static int vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index) + uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; - uword old_cpu_index; + uword old_thread_index; - if (hw->input_node_cpu_index_by_queue == 0) + if (hw->input_node_thread_index_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; - if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1) + if (vec_len (hw->input_node_thread_index_by_queue) < queue_id + 1) return VNET_API_ERROR_INVALID_INTERFACE; - old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + old_thread_index = hw->input_node_thread_index_by_queue[queue_id]; - if (old_cpu_index == cpu_index) + if (old_thread_index == thread_index) return 0; rt = - vlib_node_get_runtime_data (vlib_mains[old_cpu_index], + vlib_node_get_runtime_data (vlib_mains[old_thread_index], hw->input_node_index); vec_foreach (dq, rt->devices_and_queues) @@ -240,7 +240,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, vnet_device_main_t *vdm = &vnet_device_main; u32 hw_if_index = (u32) ~ 0; u32 queue_id = (u32) 0; - u32 cpu_index = (u32) ~ 0; + u32 thread_index = (u32) ~ 0; int rv; if (!unformat_user (input, unformat_line_input, line_input)) @@ -253,10 +253,10 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (line_input, "queue %d", &queue_id)) ; - else if (unformat (line_input, "main", &cpu_index)) - cpu_index = 0; - else if (unformat (line_input, "worker %d", &cpu_index)) - cpu_index += vdm->first_worker_cpu_index; + else if (unformat (line_input, "main", &thread_index)) + thread_index = 0; + else if (unformat (line_input, "worker %d", 
&thread_index)) + thread_index += vdm->first_worker_thread_index; else { error = clib_error_return (0, "parse error: '%U'", @@ -271,16 +271,17 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, if (hw_if_index == (u32) ~ 0) return clib_error_return (0, "please specify valid interface name"); - if (cpu_index > vdm->last_worker_cpu_index) + if (thread_index > vdm->last_worker_thread_index) return clib_error_return (0, "please specify valid worker thread or main"); - rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index); + rv = + vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index); if (rv) return clib_error_return (0, "not found"); - vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index); + vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index); return 0; } @@ -326,9 +327,9 @@ vnet_device_init (vlib_main_t * vm) tr = p ? (vlib_thread_registration_t *) p[0] : 0; if (tr && tr->count > 0) { - vdm->first_worker_cpu_index = tr->first_index; - vdm->next_worker_cpu_index = tr->first_index; - vdm->last_worker_cpu_index = tr->first_index + tr->count - 1; + vdm->first_worker_thread_index = tr->first_index; + vdm->next_worker_thread_index = tr->first_index; + vdm->last_worker_thread_index = tr->first_index + tr->count - 1; } return 0; } diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index bbb29fe3..966f8302 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -50,9 +50,9 @@ typedef struct typedef struct { vnet_device_per_worker_data_t *workers; - uword first_worker_cpu_index; - uword last_worker_cpu_index; - uword next_worker_cpu_index; + uword first_worker_thread_index; + uword last_worker_thread_index; + uword next_worker_thread_index; } vnet_device_main_t; typedef struct @@ -80,7 +80,7 @@ vnet_set_device_input_node (u32 hw_if_index, u32 node_index) } void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index); + uword 
thread_index); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -95,12 +95,12 @@ vnet_get_aggregate_rx_packets (void) } static inline void -vnet_device_increment_rx_packets (u32 cpu_index, u64 count) +vnet_device_increment_rx_packets (u32 thread_index, u64 count) { vnet_device_main_t *vdm = &vnet_device_main; vnet_device_per_worker_data_t *pwd; - pwd = vec_elt_at_index (vdm->workers, cpu_index); + pwd = vec_elt_at_index (vdm->workers, thread_index); pwd->aggregate_rx_packets += count; } @@ -117,9 +117,9 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, { vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue)); - u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; - vlib_node_set_interrupt_pending (vlib_mains[cpu_index], + ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); + u32 thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vlib_node_set_interrupt_pending (vlib_mains[thread_index], hw->input_node_index); } diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c index 68ea7832..e120eeae 100644 --- a/src/vnet/devices/netmap/node.c +++ b/src/vnet/devices/netmap/node.c @@ -98,22 +98,22 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_free_bufs; struct netmap_ring *ring; int cur_ring; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (nif->per_interface_next_index != ~0) next_index = nif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (nm->rx_buffers[cpu_index], + vec_validate (nm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - 
vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } cur_ring = nif->first_rx_ring; @@ -163,11 +163,11 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; /* grab free buffer */ u32 last_empty_buffer = - vec_len (nm->rx_buffers[cpu_index]) - 1; + vec_len (nm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = nm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = nm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -247,9 +247,9 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -260,7 +260,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { int i; u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); netmap_main_t *nm = &netmap_main; netmap_if_t *nmi; @@ -269,7 +269,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, nmi = vec_elt_at_index (nm->interfaces, i); if (nmi->is_admin_up && (i % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi); } diff --git a/src/vnet/devices/ssvm/node.c 
b/src/vnet/devices/ssvm/node.c index a6c9dfd7..539b4161 100644 --- a/src/vnet/devices/ssvm/node.c +++ b/src/vnet/devices/ssvm/node.c @@ -89,7 +89,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em, ethernet_header_t *eh0; u16 type0; u32 n_rx_bytes = 0, l3_offset0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node); volatile u32 *lock; u32 *elt_indices; @@ -284,10 +284,10 @@ out: vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, rx_queue_index); + vnet_device_increment_rx_packets (thread_index, rx_queue_index); return rx_queue_index; } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 00807dc0..5e720f65 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -331,7 +331,7 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread u32 qid = 0; - u32 cpu_index = 0; + u32 thread_index = 0; vui->use_tx_spinlock = 0; while (1) { @@ -341,20 +341,21 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) if (!rxvq->started || !rxvq->enabled) continue; - vui->per_cpu_tx_qid[cpu_index] = qid; - cpu_index++; - if (cpu_index == vlib_get_thread_main ()->n_vlib_mains) + vui->per_cpu_tx_qid[thread_index] = qid; + thread_index++; + if (thread_index == vlib_get_thread_main ()->n_vlib_mains) return; } //We need to loop, meaning the spinlock has to be used vui->use_tx_spinlock = 1; - if (cpu_index == 0) + if (thread_index == 0) { //Could not find a single valid one - for (cpu_index = 0; - cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++) + for (thread_index = 0; + thread_index < vlib_get_thread_main 
()->n_vlib_mains; + thread_index++) { - vui->per_cpu_tx_qid[cpu_index] = 0; + vui->per_cpu_tx_qid[thread_index] = 0; } return; } @@ -368,7 +369,7 @@ vhost_user_rx_thread_placement () vhost_user_intf_t *vui; vhost_cpu_t *vhc; u32 *workers = 0; - u32 cpu_index; + u32 thread_index; vlib_main_t *vm; //Let's list all workers cpu indexes @@ -400,9 +401,9 @@ vhost_user_rx_thread_placement () continue; i %= vec_len (vui_workers); - cpu_index = vui_workers[i]; + thread_index = vui_workers[i]; i++; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; @@ -429,14 +430,14 @@ vhost_user_rx_thread_placement () vhc->operation_mode = mode; } - for (cpu_index = vum->input_cpu_first_index; - cpu_index < vum->input_cpu_first_index + vum->input_cpu_count; - cpu_index++) + for (thread_index = vum->input_cpu_first_index; + thread_index < vum->input_cpu_first_index + vum->input_cpu_count; + thread_index++) { vlib_node_state_t state = VLIB_NODE_STATE_POLLING; - vhc = &vum->cpus[cpu_index]; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + vhc = &vum->cpus[thread_index]; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; switch (vhc->operation_mode) { case VHOST_USER_INTERRUPT_MODE: @@ -532,7 +533,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) { vhost_user_main_t *vum = &vhost_user_main; vhost_cpu_t *vhc; - u32 cpu_index; + u32 thread_index; vhost_iface_and_queue_t *vhiq; vlib_main_t *vm; u32 ifq2; @@ -553,8 +554,8 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) if ((vhiq->vhost_iface_index == (ifq >> 8)) && (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff))) { - cpu_index = vhc - vum->cpus; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + thread_index = vhc - vum->cpus; + vm = vlib_mains ? 
vlib_mains[thread_index] : &vlib_global_main; /* * Convert RX virtqueue number in the lower byte to vring * queue index for the input node process. Top bytes contain @@ -1592,7 +1593,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 n_trace = vlib_get_trace_count (vm, node); u16 qsz_mask; u32 map_hint = 0; - u16 cpu_index = os_get_cpu_number (); + u16 thread_index = vlib_get_thread_index (); u16 copy_len = 0; { @@ -1651,32 +1652,32 @@ vhost_user_if_input (vlib_main_t * vm, * in the loop and come back later. This is not an issue as for big packet, * processing cost really comes from the memory copy. */ - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1)) { - u32 curr_len = vum->cpus[cpu_index].rx_buffers_len; - vum->cpus[cpu_index].rx_buffers_len += + u32 curr_len = vum->cpus[thread_index].rx_buffers_len; + vum->cpus[thread_index].rx_buffers_len += vlib_buffer_alloc_from_free_list (vm, - vum->cpus[cpu_index].rx_buffers + + vum->cpus[thread_index].rx_buffers + curr_len, VHOST_USER_RX_BUFFERS_N - curr_len, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len < + (vum->cpus[thread_index].rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION)) { /* In case of buffer starvation, discard some packets from the queue * and log the event. * We keep doing best effort for the remaining packets. */ - u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ? - n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1; + u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ? + n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1; flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush); n_left -= flush; vlib_increment_simple_counter (vnet_main. 
interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), + vlib_get_thread_index (), vui->sw_if_index, flush); vlib_error_count (vm, vhost_user_input_node.index, @@ -1696,7 +1697,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 desc_data_offset; vring_desc_t *desc_table = txvq->desc; - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1)) { /* Not enough rx_buffers * Note: We yeld on 1 so we don't need to do an additional @@ -1707,17 +1708,18 @@ vhost_user_if_input (vlib_main_t * vm, } desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask]; - vum->cpus[cpu_index].rx_buffers_len--; - bi_current = (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index].rx_buffers_len]; + vum->cpus[thread_index].rx_buffers_len--; + bi_current = (vum->cpus[thread_index].rx_buffers) + [vum->cpus[thread_index].rx_buffers_len]; b_head = b_current = vlib_get_buffer (vm, bi_current); to_next[0] = bi_current; //We do that now so we can forget about bi_current to_next++; n_left_to_next--; vlib_prefetch_buffer_with_index (vm, - (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index]. + (vum-> + cpus[thread_index].rx_buffers) + [vum->cpus[thread_index]. rx_buffers_len - 1], LOAD); /* Just preset the used descriptor id and length for later */ @@ -1791,7 +1793,7 @@ vhost_user_if_input (vlib_main_t * vm, (b_current->current_length == VLIB_BUFFER_DATA_SIZE)) { if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len == 0)) + (vum->cpus[thread_index].rx_buffers_len == 0)) { /* Cancel speculation */ to_next--; @@ -1805,17 +1807,18 @@ vhost_user_if_input (vlib_main_t * vm, * but valid. 
*/ vhost_user_input_rewind_buffers (vm, - &vum->cpus[cpu_index], + &vum->cpus + [thread_index], b_head); n_left = 0; goto stop; } /* Get next output */ - vum->cpus[cpu_index].rx_buffers_len--; + vum->cpus[thread_index].rx_buffers_len--; u32 bi_next = - (vum->cpus[cpu_index].rx_buffers)[vum->cpus - [cpu_index].rx_buffers_len]; + (vum->cpus[thread_index].rx_buffers)[vum->cpus + [thread_index].rx_buffers_len]; b_current->next_buffer = bi_next; b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; bi_current = bi_next; @@ -1823,7 +1826,7 @@ vhost_user_if_input (vlib_main_t * vm, } /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; @@ -1880,7 +1883,7 @@ vhost_user_if_input (vlib_main_t * vm, if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) { if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning @@ -1905,7 +1908,7 @@ vhost_user_if_input (vlib_main_t * vm, /* Do the memory copies */ if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -1933,9 +1936,9 @@ vhost_user_if_input (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -1946,15 +1949,15 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = 
&vhost_user_main; uword n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); vhost_iface_and_queue_t *vhiq; vhost_user_intf_t *vui; vhost_cpu_t *vhc; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) { - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); @@ -2096,7 +2099,7 @@ vhost_user_tx (vlib_main_t * vm, vhost_user_vring_t *rxvq; u16 qsz_mask; u8 error; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 map_hint = 0; u8 retry = 8; u16 copy_len; @@ -2116,7 +2119,7 @@ vhost_user_tx (vlib_main_t * vm, qid = VHOST_VRING_IDX_RX (*vec_elt_at_index - (vui->per_cpu_tx_qid, os_get_cpu_number ())); + (vui->per_cpu_tx_qid, vlib_get_thread_index ())); rxvq = &vui->vrings[qid]; if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); @@ -2143,10 +2146,10 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace = + vum->cpus[thread_index].current_trace = vlib_add_trace (vm, node, b0, - sizeof (*vum->cpus[cpu_index].current_trace)); - vhost_user_tx_trace (vum->cpus[cpu_index].current_trace, + sizeof (*vum->cpus[thread_index].current_trace)); + vhost_user_tx_trace (vum->cpus[thread_index].current_trace, vui, qid / 2, b0, rxvq); } @@ -2188,14 +2191,14 @@ retry: { // Get a header from the header array virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len]; + &vum->cpus[thread_index].tx_headers[tx_headers_len]; tx_headers_len++; hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; hdr->num_buffers = 1; //This is local, no need to check // Prepare a copy order executed later for the header - vhost_copy_t *cpy = 
&vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = vui->virtio_net_hdr_sz; cpy->dst = buffer_map_addr; @@ -2220,7 +2223,7 @@ retry: else if (vui->virtio_net_hdr_sz == 12) //MRG is available { virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + &vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; //Move from available to used buffer rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = @@ -2282,7 +2285,7 @@ retry: } { - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = bytes_left; cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len; @@ -2325,8 +2328,8 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace->hdr = - vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + vum->cpus[thread_index].current_trace->hdr = + vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; } n_left--; //At the end for error counting when 'goto done' is invoked @@ -2336,7 +2339,7 @@ retry: done: //Do the memory copies if (PREDICT_FALSE - (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -2386,7 +2389,7 @@ done3: vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), vui->sw_if_index, n_left); + vlib_get_thread_index (), vui->sw_if_index, n_left); } vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); @@ -2773,11 +2776,11 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, case ~0: vec_foreach (vhc, vum->cpus) { - u32 cpu_index = vhc - vum->cpus; + u32 thread_index = vhc - vum->cpus; f64 next_timeout; next_timeout = timeout; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + 
vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; vhost_user_vring_t *rxvq = diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c index e94e871c..97ad0a44 100644 --- a/src/vnet/dpo/lookup_dpo.c +++ b/src/vnet/dpo/lookup_dpo.c @@ -266,7 +266,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -407,10 +407,10 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -511,7 +511,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -606,7 +606,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -749,10 +749,10 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); 
vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -853,7 +853,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -930,7 +930,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -994,7 +994,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a9f334be..e25ceae9 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -627,7 +627,7 @@ replicate_inline (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -657,12 +657,12 @@ replicate_inline (vlib_main_t * vm, rep0 = replicate_get(repi0); vlib_increment_combined_counter( - cm, cpu_index, repi0, 1, + cm, thread_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); + vec_validate (rm->clones[thread_index], 
rep0->rep_n_buckets - 1); - num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128); if (num_cloned != rep0->rep_n_buckets) { @@ -673,7 +673,7 @@ replicate_inline (vlib_main_t * vm, for (bucket = 0; bucket < num_cloned; bucket++) { - ci0 = rm->clones[cpu_index][bucket]; + ci0 = rm->clones[thread_index][bucket]; c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; @@ -700,7 +700,7 @@ replicate_inline (vlib_main_t * vm, vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); } } - vec_reset_length (rm->clones[cpu_index]); + vec_reset_length (rm->clones[thread_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index ee757505..c74a097e 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -1771,7 +1771,7 @@ set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a) { vnet_main_t *vm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (a->flags & ETHERNET_ARP_ARGS_REMOVE) vnet_arp_unset_ip4_over_ethernet_internal (vm, a); diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index 9894e3c8..335e3f9f 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -362,7 +362,7 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT; u32 i, next_node_index, bvi_flag, sw_if_index; u32 n_pkts = 0, n_bytes = 0; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; vlib_node_main_t *nm = &vm->node_main; @@ -420,8 +420,9 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, /* increment TX interface stat */ vlib_increment_combined_counter (im->combined_sw_if_counters 
+ - VNET_INTERFACE_COUNTER_TX, cpu_index, - sw_if_index, n_pkts, n_bytes); + VNET_INTERFACE_COUNTER_TX, + thread_index, sw_if_index, n_pkts, + n_bytes); } return n_left_from; diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index b699e381..f7787ed2 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -291,7 +291,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node; u32 n_left_from, next_index, *from, *to_next; u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 cached_sw_if_index = ~0; u32 cached_is_l2 = 0; /* shut up gcc */ vnet_hw_interface_t *hi = NULL; /* used for main interface only */ @@ -510,7 +510,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index0, 1, len0); if (new_sw_if_index1 != old_sw_if_index1 @@ -519,7 +519,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index1, 1, len1); @@ -530,7 +530,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; @@ -696,13 +696,13 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, new_sw_if_index0, 1, len0); + thread_index, new_sw_if_index0, 1, len0); if (stats_n_packets > 0) { vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; } @@ -734,7 +734,7 @@ ethernet_input_inline 
(vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c index 2683586e..acf15f24 100644 --- a/src/vnet/gre/node.c +++ b/src/vnet/gre/node.c @@ -75,7 +75,7 @@ gre_input (vlib_main_t * vm, u64 cached_tunnel_key6[4]; u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 len; vnet_interface_main_t *im = &gm->vnet_main->interface_main; @@ -257,7 +257,7 @@ gre_input (vlib_main_t * vm, len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -324,7 +324,7 @@ drop0: len = vlib_buffer_length_in_chain (vm, b1); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -502,7 +502,7 @@ drop1: len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); diff --git a/src/vnet/interface.h b/src/vnet/interface.h index a1ea2d61..08f08b10 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -468,7 +468,7 @@ typedef struct vnet_hw_interface_t u32 input_node_index; /* input node cpu index by queue */ - u32 *input_node_cpu_index_by_queue; + u32 *input_node_thread_index_by_queue; } vnet_hw_interface_t; diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index 03f2cdca..663dc309 100644 --- a/src/vnet/interface_output.c +++ 
b/src/vnet/interface_output.c @@ -196,7 +196,7 @@ slow_path (vlib_main_t * vm, */ static_always_inline void incr_output_stats (vnet_main_t * vnm, - u32 cpu_index, + u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) @@ -216,7 +216,7 @@ incr_output_stats (vnet_main_t * vnm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } @@ -240,7 +240,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 last_sw_if_index; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; n_buffers = frame->n_vectors; @@ -266,7 +266,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, /* buffer stride */ 1, @@ -341,18 +341,18 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, from += 1; to_tx += n_buffers; n_left_to_tx -= n_buffers; - incr_output_stats (vnm, cpu_index, n_slow_bytes, + incr_output_stats (vnm, thread_index, n_slow_bytes, vnet_buffer (b)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); } } else { - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b1)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -396,7 +396,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, to_tx += n_buffers; n_left_to_tx -= n_buffers; } - incr_output_stats 
(vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -408,7 +408,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, } /* Final update of interface stats. */ - incr_output_stats (vnm, cpu_index, 0, ~0, /* ~0 will flush stats */ + incr_output_stats (vnm, thread_index, 0, ~0, /* ~0 will flush stats */ &last_sw_if_index, &n_packets, &n_bytes); return n_buffers; @@ -428,7 +428,7 @@ vnet_interface_output_node (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_interface_main_t *im = &vnm->interface_main; u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX; u32 current_config_index = ~0; @@ -458,7 +458,7 @@ vnet_interface_output_node (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, @@ -558,7 +558,7 @@ vnet_interface_output_node (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } @@ -567,7 +567,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif1, 1, + thread_index, tx_swif1, 1, n_bytes_b1); } @@ -576,7 +576,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif2, 1, + thread_index, tx_swif2, 1, n_bytes_b2); } if (PREDICT_FALSE (tx_swif3 != rt->sw_if_index)) @@ -584,7 +584,7 @@ 
vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif3, 1, + thread_index, tx_swif3, 1, n_bytes_b3); } } @@ -623,7 +623,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } } @@ -634,7 +634,7 @@ vnet_interface_output_node (vlib_main_t * vm, /* Update main interface stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, rt->sw_if_index, n_packets, n_bytes); return n_buffers; } @@ -893,7 +893,7 @@ process_drop_punt (vlib_main_t * vm, u32 current_sw_if_index, n_errors_current_sw_if_index; u64 current_counter; vlib_simple_counter_main_t *cm; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; static vlib_error_t memory[VNET_ERROR_N_DISPOSITION]; static char memory_init[VNET_ERROR_N_DISPOSITION]; @@ -965,19 +965,19 @@ process_drop_punt (vlib_main_t * vm, current_counter -= 2; n_errors_current_sw_if_index -= 2; - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Increment super-interface drop/punt counters for sub-interfaces. 
*/ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); vlib_increment_simple_counter - (cm, cpu_index, sw_if0->sup_sw_if_index, + (cm, thread_index, sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1); vlib_increment_simple_counter - (cm, cpu_index, sw_if1->sup_sw_if_index, + (cm, thread_index, sw_if1->sup_sw_if_index, sw_if1->sup_sw_if_index != sw_if_index1); em->counters[current_counter_index] = current_counter; @@ -1013,11 +1013,12 @@ process_drop_punt (vlib_main_t * vm, sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; /* Increment drop/punt counters. */ - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); - vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, + sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); if (PREDICT_FALSE (e0 != current_error)) @@ -1041,12 +1042,12 @@ process_drop_punt (vlib_main_t * vm, { vnet_sw_interface_t *si; - vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, current_sw_if_index, n_errors_current_sw_if_index); si = vnet_get_sw_interface (vnm, current_sw_if_index); if (si->sup_sw_if_index != current_sw_if_index) - vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, si->sup_sw_if_index, n_errors_current_sw_if_index); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index ee1703e7..fdfe7f63 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -75,7 +75,7 @@ ip4_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, 
*to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -292,19 +292,19 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lb_index0, 1, + (cm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, p0) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index1, 1, + (cm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, p1) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index2, 1, + (cm, thread_index, lb_index2, 1, vlib_buffer_length_in_chain (vm, p2) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index3, 1, + (cm, thread_index, lb_index3, 1, vlib_buffer_length_in_chain (vm, p3) + sizeof (ethernet_header_t)); @@ -392,7 +392,7 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -479,7 +479,7 @@ ip4_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -584,9 +584,9 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, 
vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -639,7 +639,7 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -2330,7 +2330,7 @@ ip4_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2379,9 +2379,9 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) { vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index1); + thread_index, adj_index1); } ip0 = vlib_buffer_get_current (p0); @@ -2527,13 +2527,13 @@ ip4_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2618,7 +2618,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); /* Guess we are only writing on simple Ethernet header. 
*/ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2637,7 +2637,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c index ba200a9f..3b08f4b0 100644 --- a/src/vnet/ip/ip4_input.c +++ b/src/vnet/ip/ip4_input.c @@ -85,7 +85,7 @@ ip4_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -178,8 +178,8 @@ ip4_input_inline (vlib_main_t * vm, vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) @@ -299,7 +299,7 @@ ip4_input_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Punt packets with options or wrong version. 
*/ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index c120f12c..c2fc4f87 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -74,7 +74,7 @@ ip6_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -185,9 +185,9 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); from += 2; to_next += 2; @@ -291,7 +291,7 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -703,7 +703,7 @@ ip6_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ip6_main_t *im = &ip6_main; from = vlib_frame_vector_args (frame); @@ -824,9 +824,9 @@ ip6_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain 
(vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -886,7 +886,7 @@ ip6_load_balance (vlib_main_t * vm, } vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -1897,7 +1897,7 @@ ip6_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2019,11 +2019,11 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index1, 1, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2156,7 +2156,7 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); } diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c index 20306088..ffdc4727 100644 --- a/src/vnet/ip/ip6_input.c +++ b/src/vnet/ip/ip6_input.c @@ -82,7 +82,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -171,8 +171,8 @@ ip6_input (vlib_main_t * vm, 
vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); error0 = error1 = IP6_ERROR_NONE; @@ -270,7 +270,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); error0 = IP6_ERROR_NONE; /* Version != 6? Drop it. */ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 5d1fb6f8..2af546df 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -581,7 +581,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, u32 next_index; pending_resolution_t *pr, *mc; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 1 /* set new neighbor */ , is_static, @@ -722,7 +722,7 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, uword *p; int rv = 0; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 0 /* unset */ , 0, 0); diff --git a/src/vnet/ipsec/esp.h b/src/vnet/ipsec/esp.h index 50cac806..799003b9 100644 --- a/src/vnet/ipsec/esp.h +++ b/src/vnet/ipsec/esp.h @@ -282,8 +282,8 @@ hmac_calc (ipsec_integ_alg_t alg, u8 * data, int data_len, u8 * signature, u8 use_esn, u32 seq_hi) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - HMAC_CTX *ctx = &(em->per_thread_data[cpu_index].hmac_ctx); + u32 thread_index = vlib_get_thread_index (); 
+ HMAC_CTX *ctx = &(em->per_thread_data[thread_index].hmac_ctx); const EVP_MD *md = NULL; unsigned int len; @@ -292,10 +292,10 @@ hmac_calc (ipsec_integ_alg_t alg, if (PREDICT_FALSE (em->esp_integ_algs[alg].md == 0)) return 0; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_integ_alg)) + if (PREDICT_FALSE (alg != em->per_thread_data[thread_index].last_integ_alg)) { md = em->esp_integ_algs[alg].md; - em->per_thread_data[cpu_index].last_integ_alg = alg; + em->per_thread_data[thread_index].last_integ_alg = alg; } HMAC_Init (ctx, key, key_len, md); diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c index 7289b260..925d2b45 100644 --- a/src/vnet/ipsec/esp_decrypt.c +++ b/src/vnet/ipsec/esp_decrypt.c @@ -85,8 +85,8 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].decrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].decrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -95,10 +95,11 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == 0)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_decrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_decrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_decrypt_alg = alg; + em->per_thread_data[thread_index].last_decrypt_alg = alg; } EVP_DecryptInit_ex (ctx, cipher, NULL, key, iv); @@ -117,11 +118,11 @@ esp_decrypt_node_fn (vlib_main_t * vm, u32 *recycle = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = 
im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/esp_encrypt.c b/src/vnet/ipsec/esp_encrypt.c index 44ae2297..b2bc4e0b 100644 --- a/src/vnet/ipsec/esp_encrypt.c +++ b/src/vnet/ipsec/esp_encrypt.c @@ -88,8 +88,8 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].encrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].encrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -98,10 +98,11 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == IPSEC_CRYPTO_ALG_NONE)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_encrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_encrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_encrypt_alg = alg; + em->per_thread_data[thread_index].last_encrypt_alg = alg; } EVP_EncryptInit_ex (ctx, cipher, NULL, key, iv); @@ -119,11 +120,11 @@ esp_encrypt_node_fn (vlib_main_t * vm, n_left_from = from_frame->n_vectors; ipsec_main_t *im = &ipsec_main; u32 *recycle = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 2c1074d8..3f9978a7 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -303,16 +303,16 @@ static void ikev2_delete_sa (ikev2_sa_t * sa) { ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); 
+ u32 thread_index = vlib_get_thread_index (); uword *p; ikev2_sa_free_all_vec (sa); - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); if (p) { - hash_unset (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); - pool_put (km->per_thread_data[cpu_index].sas, sa); + hash_unset (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); + pool_put (km->per_thread_data[thread_index].sas, sa); } } @@ -776,29 +776,31 @@ ikev2_initial_contact_cleanup (ikev2_sa_t * sa) ikev2_sa_t *tmp; u32 i, *delete = 0; ikev2_child_sa_t *c; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); if (!sa->initial_contact) return; /* find old IKE SAs with the same authenticated identity */ /* *INDENT-OFF* */ - pool_foreach (tmp, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (tmp, km->per_thread_data[thread_index].sas, ({ if (tmp->i_id.type != sa->i_id.type || vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) || memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data))) continue; if (sa->rspi != tmp->rspi) - vec_add1(delete, tmp - km->per_thread_data[cpu_index].sas); + vec_add1(delete, tmp - km->per_thread_data[thread_index].sas); })); /* *INDENT-ON* */ for (i = 0; i < vec_len (delete); i++) { - tmp = pool_elt_at_index (km->per_thread_data[cpu_index].sas, delete[i]); - vec_foreach (c, tmp->childs) - ikev2_delete_tunnel_interface (km->vnet_main, tmp, c); + tmp = + pool_elt_at_index (km->per_thread_data[thread_index].sas, delete[i]); + vec_foreach (c, + tmp->childs) ikev2_delete_tunnel_interface (km->vnet_main, + tmp, c); ikev2_delete_sa (tmp); } @@ -1922,10 +1924,10 @@ ikev2_retransmit_sa_init (ike_header_t * ike, { ikev2_main_t *km = &ikev2_main; ikev2_sa_t *sa; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ - pool_foreach (sa, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (sa, 
km->per_thread_data[thread_index].sas, ({ if (sa->ispi == clib_net_to_host_u64(ike->ispi) && sa->iaddr.as_u32 == iaddr.as_u32 && sa->raddr.as_u32 == raddr.as_u32) @@ -2036,7 +2038,7 @@ ikev2_node_fn (vlib_main_t * vm, u32 n_left_from, *from, *to_next; ikev2_next_t next_index; ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -2134,11 +2136,14 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, + sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km-> + per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - + km->per_thread_data[thread_index].sas); } else { @@ -2169,11 +2174,11 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km->per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - km->per_thread_data[thread_index].sas); } else { @@ -2184,12 +2189,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2240,12 +2246,13 @@ ikev2_node_fn 
(vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2305,12 +2312,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) diff --git a/src/vnet/ipsec/ipsec.h b/src/vnet/ipsec/ipsec.h index 58f0f145..c884e360 100644 --- a/src/vnet/ipsec/ipsec.h +++ b/src/vnet/ipsec/ipsec.h @@ -324,21 +324,21 @@ int ipsec_set_interface_key (vnet_main_t * vnm, u32 hw_if_index, always_inline void ipsec_alloc_empty_buffers (vlib_main_t * vm, ipsec_main_t * im) { - u32 cpu_index = os_get_cpu_number (); - uword l = vec_len (im->empty_buffers[cpu_index]); + u32 thread_index = vlib_get_thread_index (); + uword l = vec_len (im->empty_buffers[thread_index]); uword n_alloc = 0; if (PREDICT_FALSE (l < VLIB_FRAME_SIZE)) { - if (!im->empty_buffers[cpu_index]) + if (!im->empty_buffers[thread_index]) { - vec_alloc (im->empty_buffers[cpu_index], 2 * VLIB_FRAME_SIZE); + vec_alloc (im->empty_buffers[thread_index], 2 * VLIB_FRAME_SIZE); } - n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[cpu_index] + l, + n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[thread_index] + l, 2 * VLIB_FRAME_SIZE - l); - _vec_len (im->empty_buffers[cpu_index]) = l + n_alloc; + _vec_len 
(im->empty_buffers[thread_index]) = l + n_alloc; } } diff --git a/src/vnet/ipsec/ipsec_if.c b/src/vnet/ipsec/ipsec_if.c index dc882004..ed124894 100644 --- a/src/vnet/ipsec/ipsec_if.c +++ b/src/vnet/ipsec/ipsec_if.c @@ -99,7 +99,7 @@ static int ipsec_add_del_tunnel_if_rpc_callback (ipsec_add_del_tunnel_args_t * a) { vnet_main_t *vnm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); return ipsec_add_del_tunnel_if_internal (vnm, a); } diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index dd1130a6..e21a1616 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -97,7 +97,7 @@ l2_to_bvi (vlib_main_t * vlib_main, vlib_increment_combined_counter (vnet_main->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - vlib_main->cpu_index, + vlib_main->thread_index, vnet_buffer (b0)->sw_if_index[VLIB_RX], 1, vlib_buffer_length_in_chain (vlib_main, b0)); return TO_BVI_ERR_OK; diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c index 041ff38d..e5d6878a 100644 --- a/src/vnet/l2/l2_input.c +++ b/src/vnet/l2/l2_input.c @@ -117,7 +117,7 @@ typedef enum static_always_inline void classify_and_dispatch (vlib_main_t * vm, vlib_node_runtime_t * node, - u32 cpu_index, + u32 thread_index, l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0) { /* @@ -237,7 +237,7 @@ l2input_node_inline (vlib_main_t * vm, u32 n_left_from, *from, *to_next; l2input_next_t next_index; l2input_main_t *msm = &l2input_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; /* number of packets to process */ @@ -350,10 +350,10 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 4); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); - classify_and_dispatch (vm, node, cpu_index, msm, b1, &next1); - classify_and_dispatch (vm, node, cpu_index, 
msm, b2, &next2); - classify_and_dispatch (vm, node, cpu_index, msm, b3, &next3); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b1, &next1); + classify_and_dispatch (vm, node, thread_index, msm, b2, &next2); + classify_and_dispatch (vm, node, thread_index, msm, b3, &next3); /* verify speculative enqueues, maybe switch current next frame */ /* if next0==next1==next_index then nothing special needs to be done */ @@ -393,7 +393,7 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 1); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); /* verify speculative enqueue, maybe switch current next frame */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/l2/l2_output.c b/src/vnet/l2/l2_output.c index 00f22571..e17b2a16 100644 --- a/src/vnet/l2/l2_output.c +++ b/src/vnet/l2/l2_output.c @@ -643,11 +643,11 @@ l2output_create_output_node_mapping (vlib_main_t * vlib_main, vnet_main_t * vnet hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index); - uword cpu_number; + uword thread_index; - cpu_number = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); - if (cpu_number) + if (thread_index) { u32 oldflags; diff --git a/src/vnet/l2tp/decap.c b/src/vnet/l2tp/decap.c index e8986935..46104129 100644 --- a/src/vnet/l2tp/decap.c +++ b/src/vnet/l2tp/decap.c @@ -149,7 +149,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b) + sizeof (ethernet_header_t)); diff --git a/src/vnet/l2tp/encap.c b/src/vnet/l2tp/encap.c index ed7a9580..dcdfde4b 100644 --- a/src/vnet/l2tp/encap.c +++ 
b/src/vnet/l2tp/encap.c @@ -124,7 +124,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b)); diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index cb94d7e7..3dedc447 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -157,7 +157,7 @@ test_counters_command_fn (vlib_main_t * vm, u32 session_index; u32 counter_index; u32 nincr = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ pool_foreach (session, lm->sessions, @@ -167,11 +167,11 @@ test_counters_command_fn (vlib_main_t * vm, session_index_to_counter_index (session_index, SESSION_COUNTER_USER_TO_NETWORK); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index, 1/*pkt*/, 1111 /*bytes*/); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index+1, 1/*pkt*/, 2222 /*bytes*/); nincr++; diff --git a/src/vnet/lisp-gpe/decap.c b/src/vnet/lisp-gpe/decap.c index d887a95f..68769710 100644 --- a/src/vnet/lisp-gpe/decap.c +++ b/src/vnet/lisp-gpe/decap.c @@ -103,7 +103,7 @@ next_index_to_iface (lisp_gpe_main_t * lgm, u32 next_index) } static_always_inline void -incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, +incr_decap_stats (vnet_main_t * vnm, u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) { @@ -122,7 +122,7 @@ incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, *last_sw_if_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } *last_sw_if_index = sw_if_index; @@ -150,11 +150,11 @@ static uword lisp_gpe_input_inline 
(vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_v4) { - u32 n_left_from, next_index, *from, *to_next, cpu_index; + u32 n_left_from, next_index, *from, *to_next, thread_index; u32 n_bytes = 0, n_packets = 0, last_sw_if_index = ~0, drops = 0; lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main (); - cpu_index = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -267,7 +267,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -282,7 +282,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si1) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b1), si1[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b1)->sw_if_index[VLIB_RX] = si1[0]; @@ -397,7 +397,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -430,7 +430,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* flush iface stats */ - incr_decap_stats (lgm->vnet_main, cpu_index, 0, ~0, &last_sw_if_index, + incr_decap_stats (lgm->vnet_main, thread_index, 0, ~0, &last_sw_if_index, &n_packets, &n_bytes); vlib_node_increment_counter (vm, lisp_gpe_ip4_input_node.index, LISP_GPE_ERROR_NO_TUNNEL, drops); diff --git a/src/vnet/lldp/lldp_input.c b/src/vnet/lldp/lldp_input.c index 762743d0..e88f6fdb 100644 --- 
a/src/vnet/lldp/lldp_input.c +++ b/src/vnet/lldp/lldp_input.c @@ -35,7 +35,7 @@ typedef struct static void lldp_rpc_update_peer_cb (const lldp_intf_update_t * a) { - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); lldp_intf_t *n = lldp_get_intf (&lldp_main, a->hw_if_index); if (!n) diff --git a/src/vnet/map/ip4_map.c b/src/vnet/map/ip4_map.c index 1a20d704..e39b6f14 100644 --- a/src/vnet/map/ip4_map.c +++ b/src/vnet/map/ip4_map.c @@ -248,7 +248,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -377,7 +377,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -409,7 +409,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip6h1->payload_length) + @@ -520,7 +520,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? 
IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -564,7 +564,7 @@ ip4_map_reass (vlib_main_t * vm, next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -694,8 +694,8 @@ ip4_map_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip60->payload_length) + 40); next0 = diff --git a/src/vnet/map/ip4_map_t.c b/src/vnet/map/ip4_map_t.c index b63d76bf..5f2bcbf9 100644 --- a/src/vnet/map/ip4_map_t.c +++ b/src/vnet/map/ip4_map_t.c @@ -477,7 +477,7 @@ ip4_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -520,7 +520,7 @@ ip4_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. 
map_domain_index, 1, len0); } @@ -1051,7 +1051,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1158,7 +1158,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> @@ -1169,7 +1169,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p1)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip41-> @@ -1252,7 +1252,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. 
map_domain_index, 1, clib_net_to_host_u16 (ip40-> diff --git a/src/vnet/map/ip6_map.c b/src/vnet/map/ip6_map.c index f7eb768f..63ada962 100644 --- a/src/vnet/map/ip6_map.c +++ b/src/vnet/map/ip6_map.c @@ -172,7 +172,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_get_runtime (vm, ip6_map_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -319,7 +319,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -352,7 +352,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next1; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip41->length)); @@ -505,7 +505,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -820,7 +820,7 @@ ip6_map_ip4_reass (vlib_main_t * vm, vlib_node_get_runtime (vm, ip6_map_ip4_reass_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -958,8 +958,8 @@ ip6_map_ip4_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, map_domain_index0, - 1, + thread_index, + 
map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); next0 = @@ -1015,7 +1015,7 @@ ip6_map_icmp_relay (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index); map_main_t *mm = &map_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u16 *fragment_ids, *fid; from = vlib_frame_vector_args (frame); @@ -1143,7 +1143,8 @@ ip6_map_icmp_relay (vlib_main_t * vm, ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20); new_icmp40->checksum = ~ip_csum_fold (sum); - vlib_increment_simple_counter (&mm->icmp_relayed, cpu_index, 0, 1); + vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0, + 1); error: if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/map/ip6_map_t.c b/src/vnet/map/ip6_map_t.c index eb3996c2..99151678 100644 --- a/src/vnet/map/ip6_map_t.c +++ b/src/vnet/map/ip6_map_t.c @@ -448,7 +448,7 @@ ip6_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -493,7 +493,7 @@ ip6_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, len0); @@ -1051,7 +1051,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_t_node.index); vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1218,7 +1218,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == 
MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1229,7 +1229,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p1)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1403,7 +1403,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c index 893c4511..1b9bdd05 100644 --- a/src/vnet/mpls/mpls_input.c +++ b/src/vnet/mpls/mpls_input.c @@ -76,7 +76,7 @@ mpls_input_inline (vlib_main_t * vm, u32 n_left_from, next_index, * from, * to_next; mpls_input_runtime_t * rt; mpls_main_t * mm; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_simple_counter_main_t * cm; vnet_main_t * vnm = vnet_get_main(); @@ -151,7 +151,7 @@ mpls_input_inline (vlib_main_t * vm, next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(h1[3] == 0)) @@ -164,7 +164,7 @@ mpls_input_inline (vlib_main_t * vm, next1 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index1, &next1, b1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); } if (PREDICT_FALSE(b0->flags 
& VLIB_BUFFER_IS_TRACED)) @@ -215,7 +215,7 @@ mpls_input_inline (vlib_main_t * vm, { next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index 475bb204..ace6a70f 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -67,7 +67,7 @@ mpls_lookup (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; mpls_main_t * mm = &mpls_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -220,16 +220,16 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter - (cm, cpu_index, lbi2, 1, + (cm, thread_index, lbi2, 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter - (cm, cpu_index, lbi3, 1, + (cm, thread_index, lbi3, 1, vlib_buffer_length_in_chain (vm, b3)); /* @@ -351,7 +351,7 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); /* @@ -440,7 +440,7 @@ mpls_load_balance (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, * from, * to_next; - u32 cpu_index = 
os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 next; from = vlib_frame_vector_args (frame); @@ -536,10 +536,10 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) @@ -597,7 +597,7 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c index 08018fd1..d90dec21 100644 --- a/src/vnet/mpls/mpls_output.c +++ b/src/vnet/mpls/mpls_output.c @@ -64,12 +64,12 @@ mpls_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_midchain) { - u32 n_left_from, next_index, * from, * to_next, cpu_index; + u32 n_left_from, next_index, * from, * to_next, thread_index; vlib_node_runtime_t * error_node; u32 n_left_to_next; mpls_main_t *mm; - cpu_index = os_get_cpu_number(); + thread_index = vlib_get_thread_index(); error_node = vlib_node_get_runtime (vm, mpls_output_node.index); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -137,13 +137,13 @@ mpls_output_inline (vlib_main_t * vm, /* Bump the adj counters for packet and bytes */ vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); @@ -245,7 +245,7 @@ mpls_output_inline (vlib_main_t * vm, 
vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 2649798b..597ae060 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -893,7 +893,7 @@ pg_generate_set_lengths (pg_main_t * pg, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_buffers, length_sum); } @@ -1266,7 +1266,7 @@ pg_stream_fill_helper (pg_main_t * pg, l += vlib_buffer_index_length_in_chain (vm, buffers[i]); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_alloc, l); s->current_replay_packet_index += n_alloc; s->current_replay_packet_index %= diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 86d922b5..233a8c2f 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -31,16 +31,16 @@ replication_prep (vlib_main_t * vm, { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; u32 ctx_id; /* Allocate a context, reserve context 0 */ - if (PREDICT_FALSE (rm->contexts[cpu_number] == 0)) - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); + if (PREDICT_FALSE (rm->contexts[thread_index] == 0)) + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); - ctx_id = ctx - rm->contexts[cpu_number]; + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); + ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ ctx->saved_free_list_index = b0->free_list_index; @@ -94,11 +94,11 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 
is_last) { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; /* Get access to the replication context */ - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); /* Restore vnet buffer state */ clib_memcpy (vnet_buffer (b0), ctx->vnet_buffer, @@ -133,7 +133,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ - pool_put (rm->contexts[cpu_number], ctx); + pool_put (rm->contexts[thread_index], ctx); } return ctx; @@ -160,7 +160,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) replication_main_t *rm = &replication_main; replication_context_t *ctx; u32 feature_node_index = 0; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; /* * All buffers in the list are destined to the same recycle node. @@ -172,7 +172,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) { bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } diff --git a/src/vnet/replication.h b/src/vnet/replication.h index 5dc554c9..ce4b3ff1 100644 --- a/src/vnet/replication.h +++ b/src/vnet/replication.h @@ -100,7 +100,7 @@ replication_get_ctx (vlib_buffer_t * b0) replication_main_t *rm = &replication_main; return replication_is_recycled (b0) ? 
- pool_elt_at_index (rm->contexts[os_get_cpu_number ()], + pool_elt_at_index (rm->contexts[vlib_get_thread_index ()], b0->recycle_count) : 0; } diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b86e87d9..dd211c51 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -311,7 +311,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, unix_shared_memory_queue_t *q; application_t *app; int n_tx_packets = 0; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; int i, rv; f64 now = vlib_time_now (vm); diff --git a/src/vnet/sr/sr_localsid.c b/src/vnet/sr/sr_localsid.c index 2e3d56de..6d72a506 100755 --- a/src/vnet/sr/sr_localsid.c +++ b/src/vnet/sr/sr_localsid.c @@ -887,7 +887,7 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -974,26 +974,26 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1062,8 +1062,8 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -1103,7 +1103,7 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1205,26 +1205,26 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1295,8 +1295,8 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? 
&(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index e3705060..c1567aa0 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -174,7 +174,7 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } - clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0]; while (1) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index b2a371e2..b6c34828 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -646,10 +646,10 @@ const static transport_proto_vft_t tcp6_proto = { void tcp_timer_keep_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; tcp_connection_close (tc); @@ -675,10 +675,10 @@ tcp_timer_establish_handler (u32 conn_index) void tcp_timer_waitclose_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; /* Session didn't come back with a close(). 
Send FIN either way diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 0090e15e..eaca672c 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -343,7 +343,7 @@ typedef enum _tcp_dbg_evt } \ else \ { \ - u32 _thread_index = os_get_cpu_number (); \ + u32 _thread_index = vlib_get_thread_index (); \ _tc = tcp_connection_get (_tc_index, _thread_index); \ } \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a8224dc2..7e9fa47b 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1142,7 +1142,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); @@ -1332,7 +1332,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; u8 sst = is_ip4 ? 
SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -1634,7 +1634,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1989,7 +1989,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; @@ -2243,7 +2243,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ea157bd7..e18bfad7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -387,8 +387,8 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -396,7 +396,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = 
my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -408,8 +408,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -942,7 +942,7 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); @@ -1022,7 +1022,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, snd_space, n_bytes; @@ -1152,7 +1152,7 @@ tcp_timer_persist_handler (u32 index) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, n_bytes; @@ -1313,7 +1313,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1524,7 +1524,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; diff --git 
a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 4b22109b..810278e6 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -70,7 +70,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, udp4_uri_input_next_t next_index; udp_uri_main_t *um = vnet_get_udp_main (); session_manager_main_t *smm = vnet_get_session_manager_main (); - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; u8 my_enqueue_epoch; u32 *session_indices_to_enqueue; static u32 serial_number; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index fb1a8bac..0fc62f6c 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -366,7 +366,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm, vlib_increment_combined_counter ( vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), ti->sw_if_index, + vlib_get_thread_index(), ti->sw_if_index, 1, n_bytes_in_packet); if (PREDICT_FALSE(n_trace > 0)) { diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 2cfcc92f..ac674653 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -189,7 +189,7 @@ tuntap_tx (vlib_main_t * vm, /* Update tuntap interface output stats. 
*/ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - vm->cpu_index, + vm->thread_index, tm->sw_if_index, n_packets, n_bytes); @@ -297,7 +297,7 @@ tuntap_rx (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), + vlib_get_thread_index(), tm->sw_if_index, 1, n_bytes_in_packet); diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c index 22ab4b62..d4fe4231 100644 --- a/src/vnet/vxlan-gpe/decap.c +++ b/src/vnet/vxlan-gpe/decap.c @@ -115,7 +115,7 @@ vxlan_gpe_input (vlib_main_t * vm, vxlan4_gpe_tunnel_key_t last_key4; vxlan6_gpe_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -342,7 +342,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -427,7 +427,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; stats_sw_if_index = sw_if_index1; @@ -588,7 +588,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; 
stats_sw_if_index = sw_if_index0; @@ -615,7 +615,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c index 3a486e56..67ed94b4 100644 --- a/src/vnet/vxlan-gpe/encap.c +++ b/src/vnet/vxlan-gpe/encap.c @@ -151,7 +151,7 @@ vxlan_gpe_encap (vlib_main_t * vm, vnet_main_t * vnm = ngm->vnet_main; vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; from = vlib_frame_vector_args (from_frame); @@ -253,7 +253,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; stats_n_bytes = len0 + len1; @@ -262,10 +262,10 @@ vxlan_gpe_encap (vlib_main_t * vm, { vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -335,7 +335,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); 
stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -359,7 +359,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 514b2c99..2acb1f6f 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -81,7 +81,7 @@ vxlan_input (vlib_main_t * vm, vxlan4_tunnel_key_t last_key4; vxlan6_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -314,7 +314,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -468,7 +468,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; @@ -674,7 +674,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -711,7 +711,7 @@ vxlan_input (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + 
thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c index 5b63064a..4cfbbc23 100644 --- a/src/vnet/vxlan/encap.c +++ b/src/vnet/vxlan/encap.c @@ -77,7 +77,7 @@ vxlan_encap_inline (vlib_main_t * vm, vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; u16 old_l0 = 0, old_l1 = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; u32 sw_if_index0 = 0, sw_if_index1 = 0; u32 next0 = 0, next1 = 0; @@ -301,7 +301,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; @@ -311,10 +311,10 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -464,7 +464,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -496,7 +496,7 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff 
--git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 042d02e2..4309cd51 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -66,14 +66,14 @@ _(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) void dslock (stats_main_t * sm, int release_hint, int tag) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - if (l->lock && l->thread_id == thread_id) + thread_index = vlib_get_thread_index (); + if (l->lock && l->thread_index == thread_index) { l->count++; return; @@ -85,7 +85,7 @@ dslock (stats_main_t * sm, int release_hint, int tag) while (__sync_lock_test_and_set (&l->lock, 1)) /* zzzz */ ; l->tag = tag; - l->thread_id = thread_id; + l->thread_index = thread_index; l->count = 1; } @@ -99,14 +99,14 @@ stats_dslock_with_hint (int hint, int tag) void dsunlock (stats_main_t * sm) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - ASSERT (l->lock && l->thread_id == thread_id); + thread_index = vlib_get_thread_index (); + ASSERT (l->lock && l->thread_index == thread_index); l->count--; if (l->count == 0) { diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h index 118115be..024dc78e 100644 --- a/src/vpp/stats/stats.h +++ b/src/vpp/stats/stats.h @@ -30,7 +30,7 @@ typedef struct { volatile u32 lock; volatile u32 release_hint; - u32 thread_id; + u32 thread_index; u32 count; int tag; } data_structure_lock_t; -- cgit 1.2.3-korg From 6cf30adc2cd3aa818e5d97cf71ea8b2fc2aaefa7 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 4 Apr 2017 23:08:23 -0700 Subject: Session layer refactoring Major refactoring of the session layer api - Add attatch api for application binding to the the session layer - Simplify listen/connect calls - Update application CLI - Add transport endpoint to accept callback - Associate segment manager to application 
and allow for multiple binds/connects per app Additional: - svm fifo cleanup - add fifo free, format fns - add fifo offset enqueue unit test Change-Id: Id93a65047de61afc2bf3d58c9b544339c02065af Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/scripts/vnet/uri/udp | 3 +- src/svm/svm_fifo.c | 66 ++- src/svm/svm_fifo.h | 32 +- src/svm/svm_fifo_segment.h | 14 +- src/uri/uri_tcp_test.c | 315 +++++++++----- src/uri/uri_udp_test.c | 326 +++++++++------ src/vnet.am | 2 + src/vnet/api_errno.h | 4 +- src/vnet/session/application.c | 458 +++++++++++++++------ src/vnet/session/application.h | 77 ++-- src/vnet/session/application_interface.c | 278 ++++++------- src/vnet/session/application_interface.h | 45 +- src/vnet/session/segment_manager.c | 342 ++++++++++++++++ src/vnet/session/segment_manager.h | 106 +++++ src/vnet/session/session.api | 237 +++++------ src/vnet/session/session.c | 564 +++++++------------------ src/vnet/session/session.h | 175 +++++--- src/vnet/session/session_api.c | 678 ++++++++++++------------------- src/vnet/session/transport.h | 23 +- src/vnet/tcp/builtin_client.c | 161 ++++++-- src/vnet/tcp/builtin_client.h | 7 +- src/vnet/tcp/builtin_server.c | 206 +++++++++- src/vnet/tcp/tcp.c | 20 +- src/vnet/tcp/tcp.h | 9 +- src/vnet/tcp/tcp_input.c | 7 +- src/vnet/tcp/tcp_test.c | 127 +++++- src/vnet/udp/builtin_server.c | 34 +- 27 files changed, 2601 insertions(+), 1715 deletions(-) create mode 100644 src/vnet/session/segment_manager.c create mode 100644 src/vnet/session/segment_manager.h (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/scripts/vnet/uri/udp b/src/scripts/vnet/uri/udp index ca13b83c..c7628f49 100644 --- a/src/scripts/vnet/uri/udp +++ b/src/scripts/vnet/uri/udp @@ -1,5 +1,5 @@ loop create -set int ip address loop0 10.0.0.1/32 +set int ip address loop0 6.0.0.1/32 set int state loop0 up packet-generator new { @@ -17,3 +17,4 @@ packet-generator new { incrementing 100 } } +session enable diff --git a/src/svm/svm_fifo.c 
b/src/svm/svm_fifo.c index cc84feb9..097bab77 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -20,8 +20,6 @@ svm_fifo_t * svm_fifo_create (u32 data_size_in_bytes) { svm_fifo_t *f; - pthread_mutexattr_t attr; - pthread_condattr_t cattr; f = clib_mem_alloc_aligned_or_null (sizeof (*f) + data_size_in_bytes, CLIB_CACHE_LINE_BYTES); @@ -32,29 +30,16 @@ svm_fifo_create (u32 data_size_in_bytes) f->nitems = data_size_in_bytes; f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; - memset (&attr, 0, sizeof (attr)); - memset (&cattr, 0, sizeof (cattr)); - - if (pthread_mutexattr_init (&attr)) - clib_unix_warning ("mutexattr_init"); - if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED)) - clib_unix_warning ("pthread_mutexattr_setpshared"); - if (pthread_mutex_init (&f->mutex, &attr)) - clib_unix_warning ("mutex_init"); - if (pthread_mutexattr_destroy (&attr)) - clib_unix_warning ("mutexattr_destroy"); - if (pthread_condattr_init (&cattr)) - clib_unix_warning ("condattr_init"); - if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED)) - clib_unix_warning ("condattr_setpshared"); - if (pthread_cond_init (&f->condvar, &cattr)) - clib_unix_warning ("cond_init1"); - if (pthread_condattr_destroy (&cattr)) - clib_unix_warning ("cond_init2"); - return (f); } +void +svm_fifo_free (svm_fifo_t * f) +{ + pool_free (f->ooo_segments); + clib_mem_free (f); +} + always_inline ooo_segment_t * ooo_segment_new (svm_fifo_t * f, u32 start, u32 length) { @@ -567,6 +552,43 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) return total_drop_bytes; } +u8 * +format_svm_fifo (u8 * s, va_list * args) +{ + svm_fifo_t *f = va_arg (*args, svm_fifo_t *); + int verbose = va_arg (*args, int); + + s = format (s, "cursize %u nitems %u has_event %d\n", + f->cursize, f->nitems, f->has_event); + s = format (s, "head %d tail %d\n", f->head, f->tail); + + if (verbose > 1) + s = format + (s, "server session %d thread %d client session %d thread %d\n", + 
f->server_session_index, f->server_thread_index, + f->client_session_index, f->client_thread_index); + + if (verbose) + { + ooo_segment_t *seg; + u32 seg_index; + + s = + format (s, "ooo pool %d active elts\n", pool_elts (f->ooo_segments)); + + seg_index = f->ooos_list_head; + + while (seg_index != OOO_SEGMENT_INVALID_INDEX) + { + seg = pool_elt_at_index (f->ooo_segments, seg_index); + s = format (s, " pos %u, len %u next %d\n", + seg->fifo_position, seg->length, seg->next); + seg_index = seg->next; + } + } + return s; +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 80e5b0f2..9beb63f5 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -48,10 +48,6 @@ typedef struct u32 nitems; CLIB_CACHE_LINE_ALIGN_MARK (end_cursize); - pthread_mutex_t mutex; /* 8 bytes */ - pthread_cond_t condvar; /* 8 bytes */ - svm_lock_tag_t tag; - volatile u8 has_event; /**< non-zero if deq event exists */ u32 owner_pid; @@ -60,6 +56,7 @@ typedef struct u32 client_session_index; u8 server_thread_index; u8 client_thread_index; + u32 segment_manager; CLIB_CACHE_LINE_ALIGN_MARK (end_shared); u32 head; CLIB_CACHE_LINE_ALIGN_MARK (end_consumer); @@ -74,30 +71,6 @@ typedef struct CLIB_CACHE_LINE_ALIGN_MARK (data); } svm_fifo_t; -static inline int -svm_fifo_lock (svm_fifo_t * f, u32 pid, u32 tag, int nowait) -{ - if (PREDICT_TRUE (nowait == 0)) - pthread_mutex_lock (&f->mutex); - else - { - if (pthread_mutex_trylock (&f->mutex)) - return -1; - } - f->owner_pid = pid; - f->tag = tag; - return 0; -} - -static inline void -svm_fifo_unlock (svm_fifo_t * f) -{ - f->owner_pid = 0; - f->tag = 0; - CLIB_MEMORY_BARRIER (); - pthread_mutex_unlock (&f->mutex); -} - static inline u32 svm_fifo_max_dequeue (svm_fifo_t * f) { @@ -139,6 +112,7 @@ svm_fifo_unset_event (svm_fifo_t * f) } svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); +void svm_fifo_free (svm_fifo_t * f); int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, 
u8 * copy_from_here); @@ -154,6 +128,8 @@ int svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u8 * copy_here); int svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes); +format_function_t format_svm_fifo; + always_inline ooo_segment_t * svm_fifo_newest_ooo_segment (svm_fifo_t * f) { diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index ecb5653a..9ab47a4c 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -55,6 +55,18 @@ svm_fifo_get_segment (u32 segment_index) return vec_elt_at_index (ssm->segments, segment_index); } +static inline u8 +svm_fifo_segment_has_fifos (svm_fifo_segment_private_t * fifo_segment) +{ + return vec_len ((svm_fifo_t **) fifo_segment->h->fifos) != 0; +} + +static inline svm_fifo_t ** +svm_fifo_segment_get_fifos (svm_fifo_segment_private_t * fifo_segment) +{ + return (svm_fifo_t **) fifo_segment->h->fifos; +} + #define foreach_ssvm_fifo_segment_api_error \ _(OUT_OF_SPACE, "Out of space in segment", -200) @@ -73,9 +85,7 @@ svm_fifo_t *svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, u32 data_size_in_bytes); void svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f); - void svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds); - u32 svm_fifo_segment_index (svm_fifo_segment_private_t * s); #endif /* __included_ssvm_fifo_segment_h__ */ diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index e2834817..c057e06e 100644 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -15,8 +15,6 @@ #include #include -#include -#include #include #include #include @@ -47,8 +45,7 @@ typedef struct svm_fifo_t *server_rx_fifo; svm_fifo_t *server_tx_fifo; - u32 vpp_session_index; - u32 vpp_session_thread; + u32 vpp_session_handle; } session_t; typedef enum @@ -116,7 +113,7 @@ typedef struct pthread_t client_rx_thread_handle; u32 client_bytes_received; u8 test_return_packets; - u32 bytes_to_send; + u64 bytes_to_send; /* convenience */ 
svm_fifo_segment_main_t *segment_main; @@ -152,6 +149,88 @@ wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) return -1; } +void +application_attach (uri_tcp_test_main_t * utm) +{ + vl_api_application_attach_t *bmp; + u32 fifo_size = 3 << 20; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_ATTACH); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; + bmp->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 20; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +void +application_detach (uri_tcp_test_main_t * utm) +{ + vl_api_application_detach_t *bmp; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_DETACH); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +static void +vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * + mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->retval) + { + uword *errp = hash_get (utm->error_string_by_error_number, mp->retval); + clib_warning ("attach failed: %s", *errp); + utm->state = STATE_FAILED; + return; + } + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT (mp->app_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + 
clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + utm->our_event_queue = + (unix_shared_memory_queue_t *) mp->app_event_queue_address; + +} + +static void +vl_api_application_detach_reply_t_handler (vl_api_application_detach_reply_t * + mp) +{ + if (mp->retval) + clib_warning ("detach returned with err: %d", mp->retval); +} + static void init_error_string_table (uri_tcp_test_main_t * utm) { @@ -239,21 +318,18 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) vl_api_disconnect_session_reply_t *rmp; uword *p; int rv = 0; - u64 key; - - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - p = hash_get (utm->session_index_by_vpp_handles, key); + p = hash_get (utm->session_index_by_vpp_handles, mp->handle); if (p) { session = pool_elt_at_index (utm->sessions, p[0]); - hash_unset (utm->session_index_by_vpp_handles, key); + hash_unset (utm->session_index_by_vpp_handles, mp->handle); pool_put (utm->sessions, session); } else { - clib_warning ("couldn't find session key %llx", key); + clib_warning ("couldn't find session key %llx", mp->handle); rv = -11; } @@ -264,8 +340,7 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); rmp->retval = rv; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; + rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } @@ -277,22 +352,19 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) vl_api_reset_session_reply_t *rmp; uword *p; int rv = 0; - u64 key; - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - - p = hash_get (utm->session_index_by_vpp_handles, key); + p = hash_get (utm->session_index_by_vpp_handles, mp->handle); if (p) { session = pool_elt_at_index (utm->sessions, p[0]); - hash_unset (utm->session_index_by_vpp_handles, key); + hash_unset 
(utm->session_index_by_vpp_handles, mp->handle); pool_put (utm->sessions, session); utm->time_to_stop = 1; } else { - clib_warning ("couldn't find session key %llx", key); + clib_warning ("couldn't find session key %llx", mp->handle); rv = -11; } @@ -300,8 +372,7 @@ vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) memset (rmp, 0, sizeof (*rmp)); rmp->_vl_msg_id = ntohs (VL_API_RESET_SESSION_REPLY); rmp->retval = rv; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; + rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } @@ -343,7 +414,7 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { if (n_read == -2) { - clib_warning ("weird!"); +// clib_warning ("weird!"); break; } } @@ -409,52 +480,19 @@ static void vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; session_t *session; u32 session_index; svm_fifo_t *rx_fifo, *tx_fifo; int rv; - u64 key; if (mp->retval) { - clib_warning ("connection failed with code: %d", mp->retval); - utm->state = STATE_FAILED; - return; - } - - /* - * Attatch to segment - */ - - if (mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); + uword *errp = hash_get (utm->error_string_by_error_number, -mp->retval); + clib_warning ("connection failed with code: %s", *errp); utm->state = STATE_FAILED; return; } - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - - ASSERT (mp->client_event_queue_address); - - /* Attach to the segment vpp created */ - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); - return; - } - - /* - * Save the queues - */ - - utm->our_event_queue = (unix_shared_memory_queue_t *) - mp->client_event_queue_address; - utm->vpp_event_queue = (unix_shared_memory_queue_t *) 
mp->vpp_event_queue_address; @@ -472,16 +510,14 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) session->server_rx_fifo = rx_fifo; session->server_tx_fifo = tx_fifo; - session->vpp_session_index = mp->session_index; - session->vpp_session_thread = mp->session_thread_index; + session->vpp_session_handle = mp->handle; /* Save handle */ utm->connected_session_index = session_index; utm->state = STATE_READY; /* Add it to lookup table */ - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - hash_set (utm->session_index_by_vpp_handles, key, session_index); + hash_set (utm->session_index_by_vpp_handles, mp->handle, session_index); /* Start RX thread */ rv = pthread_create (&utm->client_rx_thread_handle, @@ -606,8 +642,7 @@ client_disconnect (uri_tcp_test_main_t * utm) memset (dmp, 0, sizeof (*dmp)); dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = utm->my_client_index; - dmp->session_index = connected_session->vpp_session_index; - dmp->session_thread_index = connected_session->vpp_session_thread; + dmp->handle = connected_session->vpp_session_handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & dmp); } @@ -616,6 +651,7 @@ client_test (uri_tcp_test_main_t * utm) { int i; + application_attach (utm); client_connect (utm); if (wait_for_state_change (utm, STATE_READY)) @@ -636,47 +672,26 @@ client_test (uri_tcp_test_main_t * utm) if (wait_for_state_change (utm, STATE_START)) { + clib_warning ("Disconnect failed"); return; } + application_detach (utm); } static void vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; if (mp->retval) { - clib_warning ("bind failed: %d", mp->retval); + uword *errp = hash_get (utm->error_string_by_error_number, + -clib_net_to_host_u32 (mp->retval)); + clib_warning ("bind failed: %s", (char *) *errp); utm->state = STATE_FAILED; return; } - if 
(mp->segment_name_length == 0) - { - clib_warning ("segment_name_length zero"); - return; - } - - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; - - ASSERT (mp->server_event_queue_address); - - /* Attach to the segment vpp created */ - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); - return; - } - - utm->our_event_queue = - (unix_shared_memory_queue_t *) mp->server_event_queue_address; - utm->state = STATE_READY; } @@ -691,6 +706,89 @@ vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) utm->state = STATE_START; } +u8 * +format_ip4_address (u8 * s, va_list * args) +{ + u8 *a = va_arg (*args, u8 *); + return format (s, "%d.%d.%d.%d", a[0], a[1], a[2], a[3]); +} + +u8 * +format_ip6_address (u8 * s, va_list * args) +{ + ip6_address_t *a = va_arg (*args, ip6_address_t *); + u32 i, i_max_n_zero, max_n_zeros, i_first_zero, n_zeros, last_double_colon; + + i_max_n_zero = ARRAY_LEN (a->as_u16); + max_n_zeros = 0; + i_first_zero = i_max_n_zero; + n_zeros = 0; + for (i = 0; i < ARRAY_LEN (a->as_u16); i++) + { + u32 is_zero = a->as_u16[i] == 0; + if (is_zero && i_first_zero >= ARRAY_LEN (a->as_u16)) + { + i_first_zero = i; + n_zeros = 0; + } + n_zeros += is_zero; + if ((!is_zero && n_zeros > max_n_zeros) + || (i + 1 >= ARRAY_LEN (a->as_u16) && n_zeros > max_n_zeros)) + { + i_max_n_zero = i_first_zero; + max_n_zeros = n_zeros; + i_first_zero = ARRAY_LEN (a->as_u16); + n_zeros = 0; + } + } + + last_double_colon = 0; + for (i = 0; i < ARRAY_LEN (a->as_u16); i++) + { + if (i == i_max_n_zero && max_n_zeros > 1) + { + s = format (s, "::"); + i += max_n_zeros - 1; + last_double_colon = 1; + } + else + { + s = format (s, "%s%x", + (last_double_colon || i == 0) ? "" : ":", + clib_net_to_host_u16 (a->as_u16[i])); + last_double_colon = 0; + } + } + + return s; +} + +/* Format an IP46 address. 
*/ +u8 * +format_ip46_address (u8 * s, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + ip46_type_t type = va_arg (*args, ip46_type_t); + int is_ip4 = 1; + + switch (type) + { + case IP46_TYPE_ANY: + is_ip4 = ip46_address_is_ip4 (ip46); + break; + case IP46_TYPE_IP4: + is_ip4 = 1; + break; + case IP46_TYPE_IP6: + is_ip4 = 0; + break; + } + + return is_ip4 ? + format (s, "%U", format_ip4_address, &ip46->ip4) : + format (s, "%U", format_ip6_address, &ip46->ip6); +} + static void vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) { @@ -699,12 +797,15 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) svm_fifo_t *rx_fifo, *tx_fifo; session_t *session; static f64 start_time; - u64 key; u32 session_index; + u8 *ip_str; if (start_time == 0.0) start_time = clib_time_now (&utm->clib_time); + ip_str = format (0, "%U", format_ip46_address, &mp->ip, mp->is_ip4); + clib_warning ("Accepted session from: %s:%d", ip_str, + clib_net_to_host_u16 (mp->port)); utm->vpp_event_queue = (unix_shared_memory_queue_t *) mp->vpp_event_queue_address; @@ -721,8 +822,7 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) session->server_tx_fifo = tx_fifo; /* Add it to lookup table */ - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - hash_set (utm->session_index_by_vpp_handles, key, session_index); + hash_set (utm->session_index_by_vpp_handles, mp->handle, session_index); utm->state = STATE_READY; @@ -741,9 +841,7 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); - rmp->session_type = mp->session_type; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; + rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } @@ -837,22 +935,15 @@ server_handle_event_queue (uri_tcp_test_main_t * 
utm) } void -server_bind (uri_tcp_test_main_t * utm) +server_listen (uri_tcp_test_main_t * utm) { vl_api_bind_uri_t *bmp; - u32 fifo_size = 3 << 20; bmp = vl_msg_api_alloc (sizeof (*bmp)); memset (bmp, 0, sizeof (*bmp)); bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); bmp->client_index = utm->my_client_index; bmp->context = ntohl (0xfeedface); - bmp->initial_segment_size = 256 << 20; /* size of initial segment */ - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; - bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; - bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; - bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); } @@ -874,8 +965,10 @@ server_unbind (uri_tcp_test_main_t * utm) void server_test (uri_tcp_test_main_t * utm) { + application_attach (utm); + /* Bind to uri */ - server_bind (utm); + server_listen (utm); if (wait_for_state_change (utm, STATE_READY)) { @@ -895,6 +988,8 @@ server_test (uri_tcp_test_main_t * utm) return; } + application_detach (utm); + fformat (stdout, "Test complete...\n"); } @@ -916,7 +1011,9 @@ _(CONNECT_URI_REPLY, connect_uri_reply) \ _(DISCONNECT_SESSION, disconnect_session) \ _(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \ _(RESET_SESSION, reset_session) \ -_(MAP_ANOTHER_SEGMENT, map_another_segment) +_(APPLICATION_ATTACH_REPLY, application_attach_reply) \ +_(APPLICATION_DETACH_REPLY, application_detach_reply) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) \ void uri_api_hookup (uri_tcp_test_main_t * utm) @@ -941,7 +1038,7 @@ main (int argc, char **argv) u8 *heap, *uri = 0; u8 *bind_uri = (u8 *) "tcp://0.0.0.0/1234"; u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; - u32 bytes_to_send = 64 << 10, mbytes; + u64 bytes_to_send = 64 << 10, mbytes; u32 tmp; mheap_t *h; session_t *session; @@ -988,10 +1085,14 @@ main (int argc, char **argv) drop_packets 
= 1; else if (unformat (a, "test")) test_return_packets = 1; - else if (unformat (a, "mbytes %d", &mbytes)) + else if (unformat (a, "mbytes %lld", &mbytes)) { bytes_to_send = mbytes << 20; } + else if (unformat (a, "gbytes %lld", &mbytes)) + { + bytes_to_send = mbytes << 30; + } else { fformat (stderr, "%s: usage [master|slave]\n"); diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index e6c239c1..598052bc 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -55,6 +55,7 @@ typedef enum { STATE_START, STATE_READY, + STATE_FAILED, STATE_DISCONNECTING, } connection_state_t; @@ -162,6 +163,86 @@ setup_signal_handlers (void) return 0; } +void +application_attach (uri_udp_test_main_t * utm) +{ + vl_api_application_attach_t *bmp; + u32 fifo_size = 3 << 20; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_ATTACH); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; + bmp->options[SESSION_OPTIONS_SEGMENT_SIZE] = 256 << 20; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +void +application_detach (uri_udp_test_main_t * utm) +{ + vl_api_application_detach_t *bmp; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_DETACH); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +static void +vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * + mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if 
(mp->retval) + { + clib_warning ("attach failed: %d", mp->retval); + utm->state = STATE_FAILED; + return; + } + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT (mp->app_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + utm->our_event_queue = + (unix_shared_memory_queue_t *) mp->app_event_queue_address; +} + +static void +vl_api_application_detach_reply_t_handler (vl_api_application_detach_reply_t * + mp) +{ + if (mp->retval) + clib_warning ("detach returned with err: %d", mp->retval); +} + u8 * format_api_error (u8 * s, va_list * args) { @@ -255,9 +336,22 @@ cut_through_thread_fn (void *arg) } static void -uri_udp_slave_test (uri_udp_test_main_t * utm) +udp_client_connect (uri_udp_test_main_t * utm) { vl_api_connect_uri_t *cmp; + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); +} + +static void +client_send (uri_udp_test_main_t * utm, session_t * session) +{ int i; u8 *test_data = 0; u64 bytes_received = 0, bytes_sent = 0; @@ -265,30 +359,16 @@ uri_udp_slave_test (uri_udp_test_main_t * utm) int rv; int mypid = getpid (); f64 before, after, delta, bytes_per_second; - session_t *session; svm_fifo_t *rx_fifo, *tx_fifo; int buffer_offset, bytes_to_send = 0; + /* + * Prepare test data + */ vec_validate (test_data, 64 * 1024 - 1); for (i = 0; i < vec_len (test_data); i++) test_data[i] = i & 0xff; - cmp = vl_msg_api_alloc (sizeof (*cmp)); - memset (cmp, 0, sizeof (*cmp)); - - 
cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = utm->my_client_index; - cmp->context = ntohl (0xfeedface); - memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } - - session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); rx_fifo = session->server_rx_fifo; tx_fifo = session->server_tx_fifo; @@ -375,35 +455,38 @@ uri_udp_slave_test (uri_udp_test_main_t * utm) } static void -vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +uri_udp_client_test (uri_udp_test_main_t * utm) { - uri_udp_test_main_t *utm = &uri_udp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - int rv; + session_t *session; - if (mp->segment_name_length == 0) + application_attach (utm); + udp_client_connect (utm); + + if (wait_for_state_change (utm, STATE_READY)) { - clib_warning ("segment_name_length zero"); + clib_warning ("timeout waiting for STATE_READY"); return; } - a->segment_name = (char *) mp->segment_name; - a->segment_size = mp->segment_size; + /* Only works with cut through sessions */ + session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); - ASSERT (mp->server_event_queue_address); + client_send (utm, session); + application_detach (utm); +} - /* Attach to the segment vpp created */ - rv = svm_fifo_segment_attach (a); - if (rv) +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + + if (mp->retval) { - clib_warning ("svm_fifo_segment_attach ('%s') failed", - mp->segment_name); + clib_warning ("bind failed: %d", mp->retval); + utm->state = STATE_FAILED; return; } - utm->our_event_queue = (unix_shared_memory_queue_t *) - mp->server_event_queue_address; - utm->state = STATE_READY; } @@ -427,6 +510,9 @@ vl_api_map_another_segment_t_handler 
(vl_api_map_another_segment_t * mp) mp->segment_size); } +/** + * Acting as server for redirected connect requests + */ static void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) { @@ -456,7 +542,6 @@ vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) vec_add2 (utm->seg, seg, 1); segment_index = vec_len (sm->segments) - 1; - memcpy (seg, sm->segments + segment_index, sizeof (utm->seg[0])); pool_get (utm->sessions, session); @@ -521,7 +606,6 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) svm_fifo_t *rx_fifo, *tx_fifo; session_t *session; static f64 start_time; - u64 key; if (start_time == 0.0) start_time = clib_time_now (&utm->clib_time); @@ -539,9 +623,8 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) session->server_rx_fifo = rx_fifo; session->server_tx_fifo = tx_fifo; - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - - hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); + hash_set (utm->session_index_by_vpp_handles, mp->handle, + session - utm->sessions); utm->state = STATE_READY; @@ -556,9 +639,7 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); - rmp->session_type = mp->session_type; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; + rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } @@ -570,21 +651,18 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) vl_api_disconnect_session_reply_t *rmp; uword *p; int rv = 0; - u64 key; - - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - p = hash_get (utm->session_index_by_vpp_handles, key); + p = hash_get (utm->session_index_by_vpp_handles, mp->handle); if (p) { session = pool_elt_at_index (utm->sessions, p[0]); - hash_unset 
(utm->session_index_by_vpp_handles, key); + hash_unset (utm->session_index_by_vpp_handles, mp->handle); pool_put (utm->sessions, session); } else { - clib_warning ("couldn't find session key %llx", key); + clib_warning ("couldn't find session key %llx", mp->handle); rv = -11; } @@ -592,77 +670,76 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) memset (rmp, 0, sizeof (*rmp)); rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); rmp->retval = rv; - rmp->session_index = mp->session_index; - rmp->session_thread_index = mp->session_thread_index; + rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); } static void vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) { - svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; uri_udp_test_main_t *utm = &uri_udp_test_main; - svm_fifo_segment_create_args_t _a, *a = &_a; - ssvm_shared_header_t *sh; - svm_fifo_segment_private_t *seg; - svm_fifo_segment_header_t *fsh; - session_t *session; - u32 segment_index; - int rv; ASSERT (utm->i_am_master == 0); - if (mp->segment_name_length == 0) + /* We've been redirected */ + if (mp->segment_name_length > 0) { - clib_warning ("segment_name_length zero"); - return; - } - - memset (a, 0, sizeof (*a)); - - a->segment_name = (char *) mp->segment_name; - - sleep (1); - - rv = svm_fifo_segment_attach (a); - if (rv) - { - clib_warning ("sm_fifo_segment_create ('%v') failed", mp->segment_name); - return; - } - - segment_index = vec_len (sm->segments) - 1; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + u32 segment_index; + session_t *session; + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *seg; + svm_fifo_segment_header_t *fsh; + int rv; + + memset (a, 0, sizeof (*a)); + a->segment_name = (char *) mp->segment_name; + + sleep (1); + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%v') failed", + 
mp->segment_name); + return; + } - vec_add2 (utm->seg, seg, 1); + segment_index = vec_len (sm->segments) - 1; + vec_add2 (utm->seg, seg, 1); - memcpy (seg, sm->segments + segment_index, sizeof (*seg)); - sh = seg->ssvm.sh; - fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + memcpy (seg, sm->segments + segment_index, sizeof (*seg)); + sh = seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - while (vec_len (fsh->fifos) < 2) - sleep (1); + while (vec_len (fsh->fifos) < 2) + sleep (1); - pool_get (utm->sessions, session); - utm->cut_through_session_index = session - utm->sessions; + pool_get (utm->sessions, session); + utm->cut_through_session_index = session - utm->sessions; - session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; - ASSERT (session->server_rx_fifo); - session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; - ASSERT (session->server_tx_fifo); + session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (session->server_rx_fifo); + session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (session->server_tx_fifo); + } /* security: could unlink /dev/shm/segment_name> here, maybe */ utm->state = STATE_READY; } -#define foreach_uri_msg \ -_(BIND_URI_REPLY, bind_uri_reply) \ -_(CONNECT_URI, connect_uri) \ -_(CONNECT_URI_REPLY, connect_uri_reply) \ -_(UNBIND_URI_REPLY, unbind_uri_reply) \ -_(ACCEPT_SESSION, accept_session) \ -_(DISCONNECT_SESSION, disconnect_session) \ -_(MAP_ANOTHER_SEGMENT, map_another_segment) +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI, connect_uri) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) \ +_(APPLICATION_ATTACH_REPLY, application_attach_reply) \ +_(APPLICATION_DETACH_REPLY, application_detach_reply) \ void uri_api_hookup (uri_udp_test_main_t * utm) @@ -679,7 +756,6 @@ uri_api_hookup 
(uri_udp_test_main_t * utm) } - int connect_to_vpp (char *name) { @@ -784,26 +860,43 @@ server_handle_event_queue (uri_udp_test_main_t * utm) } } -void -uri_udp_test (uri_udp_test_main_t * utm) +static void +server_unbind (uri_udp_test_main_t * utm) { - vl_api_bind_uri_t *bmp; vl_api_unbind_uri_t *ump; + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); +} + +static void +server_listen (uri_udp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + bmp = vl_msg_api_alloc (sizeof (*bmp)); memset (bmp, 0, sizeof (*bmp)); bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); bmp->client_index = utm->my_client_index; bmp->context = ntohl (0xfeedface); - bmp->initial_segment_size = 256 << 20; /* size of initial segment */ - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; - bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 16 << 10; - bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 16 << 10; - bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); +} + +void +udp_server_test (uri_udp_test_main_t * utm) +{ + + application_attach (utm); + + /* Bind to uri */ + server_listen (utm); if (wait_for_state_change (utm, STATE_READY)) { @@ -813,13 +906,8 @@ uri_udp_test (uri_udp_test_main_t * utm) server_handle_event_queue (utm); - ump = vl_msg_api_alloc (sizeof (*ump)); - memset (ump, 0, sizeof (*ump)); - - ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); - ump->client_index = utm->my_client_index; - memcpy (ump->uri, utm->uri, vec_len (utm->uri)); - vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + /* Cleanup */ + server_unbind (utm); if (wait_for_state_change (utm, STATE_START)) { @@ -827,6 +915,8 @@ 
uri_udp_test (uri_udp_test_main_t * utm) return; } + application_detach (utm); + fformat (stdout, "Test complete...\n"); } @@ -892,7 +982,7 @@ main (int argc, char **argv) utm->i_am_master = i_am_master; utm->segment_main = &svm_fifo_segment_main; - utm->connect_uri = format (0, "udp://10.0.0.1/1234%c", 0); + utm->connect_uri = format (0, "udp://6.0.0.1/1234%c", 0); setup_signal_handlers (); @@ -907,7 +997,7 @@ main (int argc, char **argv) if (i_am_master == 0) { - uri_udp_slave_test (utm); + uri_udp_client_test (utm); exit (0); } @@ -920,7 +1010,7 @@ main (int argc, char **argv) for (i = 0; i < 200000; i++) pool_put_index (utm->sessions, i); - uri_udp_test (utm); + udp_server_test (utm); vl_client_disconnect_from_vlib (); exit (0); diff --git a/src/vnet.am b/src/vnet.am index bed4902b..25b84616 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -827,6 +827,7 @@ libvnet_la_SOURCES += \ vnet/session/session_cli.c \ vnet/session/hashes.c \ vnet/session/application_interface.c \ + vnet/session/segment_manager.c \ vnet/session/session_api.c nobase_include_HEADERS += \ @@ -835,6 +836,7 @@ nobase_include_HEADERS += \ vnet/session/transport.h \ vnet/session/application_interface.h \ vnet/session/session_debug.h \ + vnet/session/segment_manager.h \ vnet/session/session.api.h API_FILES += vnet/session/session.api diff --git a/src/vnet/api_errno.h b/src/vnet/api_errno.h index f3ffd2a6..e939404b 100644 --- a/src/vnet/api_errno.h +++ b/src/vnet/api_errno.h @@ -105,7 +105,9 @@ _(INVALID_GPE_MODE, -112, "Invalid GPE mode") \ _(LISP_GPE_ENTRIES_PRESENT, -113, "LISP GPE entries are present") \ _(ADDRESS_FOUND_FOR_INTERFACE, -114, "Address found for interface") \ _(SESSION_CONNECT_FAIL, -115, "Session failed to connect") \ -_(ENTRY_ALREADY_EXISTS, -116, "Entry already exists") +_(ENTRY_ALREADY_EXISTS, -116, "Entry already exists") \ +_(SVM_SEGMENT_CREATE_FAIL, -117, "svm segment create fail") \ +_(APPLICATION_NOT_ATTACHED, -118, "application not attached") typedef enum { diff --git 
a/src/vnet/session/application.c b/src/vnet/session/application.c index 513e5fac..5a45537b 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -14,18 +14,24 @@ */ #include +#include #include -/* +/** * Pool from which we allocate all applications */ static application_t *app_pool; -/* +/** * Hash table of apps by api client index */ static uword *app_by_api_client_index; +/** + * Default application event queue size + */ +static u32 default_app_evt_queue_size = 128; + int application_api_queue_is_full (application_t * app) { @@ -67,37 +73,71 @@ application_lookup (u32 api_client_index) return 0; } +application_t * +application_new () +{ + application_t *app; + pool_get (app_pool, app); + memset (app, 0, sizeof (*app)); + app->index = application_get_index (app); + app->connects_seg_manager = ~0; + return app; +} + void application_del (application_t * app) { - session_manager_main_t *smm = vnet_get_session_manager_main (); api_main_t *am = &api_main; void *oldheap; - session_manager_t *sm; + segment_manager_t *sm; + u64 handle; + u32 index, *handles = 0; + int i; + vnet_unbind_args_t _a, *a = &_a; + + /* + * Cleanup segment managers + */ + if (app->connects_seg_manager != (u32) ~ 0) + { + sm = segment_manager_get (app->connects_seg_manager); + segment_manager_del (sm); + } - if (app->mode == APP_SERVER) + /* *INDENT-OFF* */ + hash_foreach (handle, index, app->listeners_table, + ({ + vec_add1 (handles, handle); + })); + /* *INDENT-ON* */ + + /* Actual listener cleanup */ + for (i = 0; i < vec_len (handles); i++) { - sm = session_manager_get (app->session_manager_index); - session_manager_del (smm, sm); + a->app_index = app->api_client_index; + a->handle = handles[i]; + /* seg manager is removed when unbind completes */ + vnet_unbind (a); } - /* Free the event fifo in the /vpe-api shared-memory segment */ + /* + * Free the event fifo in the /vpe-api shared-memory segment + */ oldheap = svm_push_data_heap (am->vlib_rp); if 
(app->event_queue) unix_shared_memory_queue_free (app->event_queue); svm_pop_heap (oldheap); application_table_del (app); - pool_put (app_pool, app); } static void -application_verify_cb_fns (application_type_t type, session_cb_vft_t * cb_fns) +application_verify_cb_fns (session_cb_vft_t * cb_fns) { - if (type == APP_SERVER && cb_fns->session_accept_callback == 0) + if (cb_fns->session_accept_callback == 0) clib_warning ("No accept callback function provided"); - if (type == APP_CLIENT && cb_fns->session_connected_callback == 0) + if (cb_fns->session_connected_callback == 0) clib_warning ("No session connected callback function provided"); if (cb_fns->session_disconnect_callback == 0) clib_warning ("No session disconnect callback function provided"); @@ -105,25 +145,26 @@ application_verify_cb_fns (application_type_t type, session_cb_vft_t * cb_fns) clib_warning ("No session reset callback function provided"); } -application_t * -application_new (application_type_t type, session_type_t sst, - u32 api_client_index, u32 flags, session_cb_vft_t * cb_fns) +int +application_init (application_t * app, u32 api_client_index, u64 * options, + session_cb_vft_t * cb_fns) { - session_manager_main_t *smm = vnet_get_session_manager_main (); api_main_t *am = &api_main; - application_t *app; + segment_manager_t *sm; + segment_manager_properties_t *props; void *oldheap; - session_manager_t *sm; + u32 app_evt_queue_size; + int rv; - pool_get (app_pool, app); - memset (app, 0, sizeof (*app)); + app_evt_queue_size = options[APP_EVT_QUEUE_SIZE] > 0 ? 
+ options[APP_EVT_QUEUE_SIZE] : default_app_evt_queue_size; /* Allocate event fifo in the /vpe-api shared-memory segment */ oldheap = svm_push_data_heap (am->vlib_rp); /* Allocate server event queue */ app->event_queue = - unix_shared_memory_queue_init (128 /* nels $$$$ config */ , + unix_shared_memory_queue_init (app_evt_queue_size, sizeof (session_fifo_event_t), 0 /* consumer pid */ , 0 @@ -132,36 +173,31 @@ application_new (application_type_t type, session_type_t sst, svm_pop_heap (oldheap); - /* If a server, allocate session manager */ - if (type == APP_SERVER) - { - pool_get (smm->session_managers, sm); - memset (sm, 0, sizeof (*sm)); + /* Setup segment manager */ + sm = segment_manager_new (); + sm->app_index = app->index; + props = &app->sm_properties; + props->add_segment_size = options[SESSION_OPTIONS_ADD_SEGMENT_SIZE]; + props->rx_fifo_size = options[SESSION_OPTIONS_RX_FIFO_SIZE]; + props->tx_fifo_size = options[SESSION_OPTIONS_TX_FIFO_SIZE]; + props->add_segment = props->add_segment_size != 0; - app->session_manager_index = sm - smm->session_managers; - } - else if (type == APP_CLIENT) - { - /* Allocate connect session manager if needed */ - if (smm->connect_manager_index[sst] == INVALID_INDEX) - connects_session_manager_init (smm, sst); - app->session_manager_index = smm->connect_manager_index[sst]; - } + if ((rv = segment_manager_init (sm, props, + options[SESSION_OPTIONS_SEGMENT_SIZE]))) + return rv; - app->mode = type; - app->index = application_get_index (app); - app->session_type = sst; + app->first_segment_manager = segment_manager_index (sm); app->api_client_index = api_client_index; - app->flags = flags; + app->flags = options[SESSION_OPTIONS_FLAGS]; app->cb_fns = *cb_fns; /* Check that the obvious things are properly set up */ - application_verify_cb_fns (type, cb_fns); + application_verify_cb_fns (cb_fns); /* Add app to lookup by api_client_index table */ application_table_add (app); - return app; + return 0; } application_t * @@ -185,108 
+221,286 @@ application_get_index (application_t * app) return app - app_pool; } +static segment_manager_t * +application_alloc_segment_manager (application_t * app) +{ + segment_manager_t *sm = 0; + + if (app->first_segment_manager != (u32) ~ 0) + { + sm = segment_manager_get (app->first_segment_manager); + app->first_segment_manager = ~0; + return sm; + } + + sm = segment_manager_new (); + if (segment_manager_init (sm, &app->sm_properties, 0)) + return 0; + return sm; +} + +/** + * Start listening local transport endpoint for requested transport. + * + * Creates a 'dummy' stream session with state LISTENING to be used in session + * lookups, prior to establishing connection. Requests transport to build + * it's own specific listening connection. + */ int -application_server_init (application_t * server, u32 segment_size, - u32 add_segment_size, u32 rx_fifo_size, - u32 tx_fifo_size, u8 ** segment_name) +application_start_listen (application_t * srv, session_type_t session_type, + transport_endpoint_t * tep, u64 * res) { - session_manager_main_t *smm = vnet_get_session_manager_main (); - session_manager_t *sm; - int rv; + segment_manager_t *sm; + stream_session_t *s; + u64 handle; + + s = listen_session_new (session_type); + s->app_index = srv->index; + + if (stream_session_listen (s, tep)) + goto err; + + /* Allocate segment manager. All sessions derived out of a listen session + * have fifos allocated by the same segment manager. */ + sm = application_alloc_segment_manager (srv); + if (sm == 0) + goto err; + + /* Add to app's listener table. 
Useful to find all child listeners + * when app goes down, although, just for unbinding this is not needed */ + handle = listen_session_get_handle (s); + hash_set (srv->listeners_table, handle, segment_manager_index (sm)); - sm = session_manager_get (server->session_manager_index); + *res = handle; + return 0; + +err: + listen_session_del (s); + return -1; +} + +/** + * Stop listening on session associated to handle + */ +int +application_stop_listen (application_t * srv, u64 handle) +{ + stream_session_t *listener; + uword *indexp; + segment_manager_t *sm; - /* Add first segment */ - if ((rv = session_manager_add_first_segment (smm, sm, segment_size, - segment_name))) + if (srv && hash_get (srv->listeners_table, handle) == 0) { - return rv; + clib_warning ("app doesn't own handle %llu!", handle); + return -1; } - /* Setup session manager */ - sm->add_segment_size = add_segment_size; - sm->rx_fifo_size = rx_fifo_size; - sm->tx_fifo_size = tx_fifo_size; - sm->add_segment = sm->add_segment_size != 0; + listener = listen_session_get_from_handle (handle); + stream_session_stop_listen (listener); + + indexp = hash_get (srv->listeners_table, handle); + ASSERT (indexp); + + sm = segment_manager_get (*indexp); + segment_manager_del (sm); + hash_unset (srv->listeners_table, handle); + listen_session_del (listener); + return 0; } +int +application_open_session (application_t * app, session_type_t sst, + transport_endpoint_t * tep, u32 api_context) +{ + segment_manager_t *sm; + transport_connection_t *tc = 0; + int rv; + + /* Make sure we have a segment manager for connects */ + if (app->connects_seg_manager == (u32) ~ 0) + { + sm = application_alloc_segment_manager (app); + if (sm == 0) + return -1; + app->connects_seg_manager = segment_manager_index (sm); + } + + if ((rv = stream_session_open (app->index, sst, tep, &tc))) + return rv; + + /* Store api_context for when the reply comes. Not the nicest thing + * but better allocating a separate half-open pool. 
*/ + tc->s_index = api_context; + + return 0; +} + +segment_manager_t * +application_get_connect_segment_manager (application_t * app) +{ + ASSERT (app->connects_seg_manager != (u32) ~ 0); + return segment_manager_get (app->connects_seg_manager); +} + +segment_manager_t * +application_get_listen_segment_manager (application_t * app, + stream_session_t * s) +{ + uword *smp; + smp = hash_get (app->listeners_table, listen_session_get_handle (s)); + ASSERT (smp != 0); + return segment_manager_get (*smp); +} + +static u8 * +app_get_name_from_reg_index (application_t * app) +{ + u8 *app_name; + + vl_api_registration_t *regp; + regp = vl_api_client_index_to_registration (app->api_client_index); + if (!regp) + app_name = format (0, "builtin-%d%c", app->index, 0); + else + app_name = format (0, "%s%c", regp->name, 0); + + return app_name; +} + u8 * -format_application_server (u8 * s, va_list * args) +format_application_listener (u8 * s, va_list * args) { - application_t *srv = va_arg (*args, application_t *); + application_t *app = va_arg (*args, application_t *); + u64 handle = va_arg (*args, u64); + u32 index = va_arg (*args, u32); int verbose = va_arg (*args, int); - vl_api_registration_t *regp; stream_session_t *listener; - u8 *server_name, *str, *seg_name; - u32 segment_size; + u8 *app_name, *str; - if (srv == 0) + if (app == 0) { if (verbose) - s = format (s, "%-40s%-20s%-15s%-15s%-10s", "Connection", "Server", - "Segment", "API Client", "Cookie"); + s = format (s, "%-40s%-20s%-15s%-15s%-10s", "Connection", "App", + "API Client", "ListenerID", "SegManager"); else - s = format (s, "%-40s%-20s", "Connection", "Server"); + s = format (s, "%-40s%-20s", "Connection", "App"); return s; } - regp = vl_api_client_index_to_registration (srv->api_client_index); - if (!regp) - server_name = format (0, "builtin-%d%c", srv->index, 0); - else - server_name = regp->name; - - listener = stream_session_listener_get (srv->session_type, - srv->session_index); + app_name = 
app_get_name_from_reg_index (app); + listener = listen_session_get_from_handle (handle); str = format (0, "%U", format_stream_session, listener, verbose); - session_manager_get_segment_info (listener->server_segment_index, &seg_name, - &segment_size); if (verbose) { - s = format (s, "%-40s%-20s%-20s%-10d%-10d", str, server_name, - seg_name, srv->api_client_index, srv->accept_cookie); + s = format (s, "%-40s%-20s%-15u%-15u%-10u", str, app_name, + app->api_client_index, handle, index); } else - s = format (s, "%-40s%-20s", str, server_name); + s = format (s, "%-40s%-20s", str, app_name); + + vec_free (app_name); return s; } -u8 * -format_application_client (u8 * s, va_list * args) +void +application_format_connects (application_t * app, int verbose) { - application_t *client = va_arg (*args, application_t *); - int verbose = va_arg (*args, int); - stream_session_t *session; - u8 *str, *seg_name; - u32 segment_size; + vlib_main_t *vm = vlib_get_main (); + segment_manager_t *sm; + u8 *app_name, *s = 0; + int i, j; - if (client == 0) + /* Header */ + if (app == 0) { if (verbose) - s = - format (s, "%-40s%-20s%-10s", "Connection", "Segment", - "API Client"); + vlib_cli_output (vm, "%-40s%-20s%-15s%-10s", "Connection", "App", + "API Client", "SegManager"); else - s = format (s, "%-40s", "Connection"); + vlib_cli_output (vm, "%-40s%-20s", "Connection", "App"); + return; + } - return s; + /* make sure */ + if (app->connects_seg_manager == (u32) ~ 0) + return; + + app_name = app_get_name_from_reg_index (app); + + /* Across all fifo segments */ + sm = segment_manager_get (app->connects_seg_manager); + for (j = 0; j < vec_len (sm->segment_indices); j++) + { + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + u8 *str; + + fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]); + fifos = svm_fifo_segment_get_fifos (fifo_segment); + for (i = 0; i < vec_len (fifos); i++) + { + svm_fifo_t *fifo; + u32 session_index, thread_index; + stream_session_t *session; + 
+ /* There are 2 fifos/session. Avoid printing twice. */ + if (i % 2) + continue; + + fifo = fifos[i]; + session_index = fifo->server_session_index; + thread_index = fifo->server_thread_index; + + session = stream_session_get (session_index, thread_index); + str = format (0, "%U", format_stream_session, session, verbose); + + if (verbose) + s = format (s, "%-40s%-20s%-15u%-10u", str, app_name, + app->api_client_index, app->connects_seg_manager); + else + s = format (s, "%-40s%-20s", str, app_name); + + vlib_cli_output (vm, "%v", s); + + vec_reset_length (s); + vec_free (str); + } + vec_free (s); } - session = stream_session_get (client->session_index, client->thread_index); - str = format (0, "%U", format_stream_session, session, verbose); + vec_free (app_name); +} - session_manager_get_segment_info (session->server_segment_index, &seg_name, - &segment_size); - if (verbose) +u8 * +format_application (u8 * s, va_list * args) +{ + application_t *app = va_arg (*args, application_t *); + CLIB_UNUSED (int verbose) = va_arg (*args, int); + u8 *app_name; + + if (app == 0) { - s = format (s, "%-40s%-20s%-10d%", str, seg_name, - client->api_client_index); + if (verbose) + s = format (s, "%-10s%-20s%-15s%-15s%-15s%-15s", "Index", "Name", + "API Client", "Add seg size", "Rx fifo size", + "Tx fifo size"); + else + s = format (s, "%-10s%-20s%-20s", "Index", "Name", "API Client"); + return s; } + + app_name = app_get_name_from_reg_index (app); + if (verbose) + s = format (s, "%-10d%-20s%-15d%-15d%-15d%-15d", app->index, app_name, + app->api_client_index, app->sm_properties.add_segment_size, + app->sm_properties.rx_fifo_size, + app->sm_properties.tx_fifo_size); else - s = format (s, "%-40s", str); + s = format (s, "%-10d%-20s%-20d", app->index, app_name, + app->api_client_index); return s; } @@ -294,13 +508,12 @@ static clib_error_t * show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { - session_manager_main_t *smm = &session_manager_main; 
application_t *app; int do_server = 0; int do_client = 0; int verbose = 0; - if (!smm->is_enabled) + if (!session_manager_is_enabled ()) { clib_error_return (0, "session layer is not enabled"); } @@ -319,17 +532,24 @@ show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, if (do_server) { + u64 handle; + u32 index; if (pool_elts (app_pool)) { - vlib_cli_output (vm, "%U", format_application_server, - 0 /* header */ , + vlib_cli_output (vm, "%U", format_application_listener, + 0 /* header */ , 0, 0, verbose); /* *INDENT-OFF* */ pool_foreach (app, app_pool, ({ - if (app->mode == APP_SERVER) - vlib_cli_output (vm, "%U", format_application_server, app, - verbose); + /* App's listener sessions */ + if (hash_elts (app->listeners_table) == 0) + continue; + hash_foreach (handle, index, app->listeners_table, + ({ + vlib_cli_output (vm, "%U", format_application_listener, app, + handle, index, verbose); + })); })); /* *INDENT-ON* */ } @@ -341,15 +561,14 @@ show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, { if (pool_elts (app_pool)) { - vlib_cli_output (vm, "%U", format_application_client, - 0 /* header */ , - verbose); + application_format_connects (0, verbose); + /* *INDENT-OFF* */ pool_foreach (app, app_pool, ({ - if (app->mode == APP_CLIENT) - vlib_cli_output (vm, "%U", format_application_client, app, - verbose); + if (app->connects_seg_manager == (u32)~0) + continue; + application_format_connects (app, verbose); })); /* *INDENT-ON* */ } @@ -357,6 +576,19 @@ show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_output (vm, "No active client bindings"); } + /* Print app related info */ + if (!do_server && !do_client) + { + vlib_cli_output (vm, "%U", format_application, 0, verbose); + pool_foreach (app, app_pool, ( + { + vlib_cli_output (vm, "%U", + format_application, app, + verbose); + } + )); + } + return 0; } diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index 480828f7..6bcee9d3 100644 --- 
a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -18,11 +18,13 @@ #include #include +#include typedef enum { APP_SERVER, - APP_CLIENT + APP_CLIENT, + APP_N_TYPES } application_type_t; typedef struct _stream_session_cb_vft @@ -35,7 +37,7 @@ typedef struct _stream_session_cb_vft int (*session_accept_callback) (stream_session_t * new_session); /* Connection request callback */ - int (*session_connected_callback) (u32 api_client_index, + int (*session_connected_callback) (u32 app_index, u32 api_context, stream_session_t * s, u8 code); /** Notify app that session is closing */ @@ -59,45 +61,52 @@ typedef struct _application /** Flags */ u32 flags; + /* Stream server mode: accept or connect + * TODO REMOVE*/ + u8 mode; + + /** Index of the listen session or connect session + * TODO REMOVE*/ + u32 session_index; + + /** Session thread index for client connect sessions + * TODO REMOVE */ + u32 thread_index; + + /* + * Binary API interface to external app + */ + /** Binary API connection index, ~0 if internal */ u32 api_client_index; - /* */ - u32 api_context; - /** Application listens for events on this svm queue */ unix_shared_memory_queue_t *event_queue; - /** Stream session type */ - u8 session_type; - - /* Stream server mode: accept or connect */ - u8 mode; + /* + * Callbacks: shoulder-taps for the server/client + */ - u32 session_manager_index; + session_cb_vft_t cb_fns; /* - * Bind/Listen specific + * svm segment management */ + u32 connects_seg_manager; - /** Accept cookie, for multiple session flavors ($$$ maybe) */ - u32 accept_cookie; + /* Lookup tables for listeners. Value is segment manager index */ + uword *listeners_table; - /** Index of the listen session or connect session */ - u32 session_index; + u32 first_segment_manager; - /** Session thread index for client connect sessions */ - u32 thread_index; - - /* - * Callbacks: shoulder-taps for the server/client - */ - session_cb_vft_t cb_fns; + /** Segment manager properties. 
Shared by all segment managers */ + segment_manager_properties_t sm_properties; } application_t; -application_t *application_new (application_type_t type, session_type_t sst, - u32 api_client_index, u32 flags, - session_cb_vft_t * cb_fns); +application_t *application_new (); +int +application_init (application_t * app, u32 api_client_index, u64 * options, + session_cb_vft_t * cb_fns); void application_del (application_t * app); application_t *application_get (u32 index); application_t *application_get_if_valid (u32 index); @@ -105,11 +114,21 @@ application_t *application_lookup (u32 api_client_index); u32 application_get_index (application_t * app); int -application_server_init (application_t * server, u32 segment_size, - u32 add_segment_size, u32 rx_fifo_size, - u32 tx_fifo_size, u8 ** segment_name); +application_start_listen (application_t * app, session_type_t session_type, + transport_endpoint_t * tep, u64 * handle); +int application_stop_listen (application_t * srv, u64 handle); +int +application_open_session (application_t * app, session_type_t sst, + transport_endpoint_t * tep, u32 api_context); int application_api_queue_is_full (application_t * app); +segment_manager_t *application_get_listen_segment_manager (application_t * + app, + stream_session_t * + s); +segment_manager_t *application_get_connect_segment_manager (application_t * + app); + #endif /* SRC_VNET_SESSION_APPLICATION_H_ */ /* diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 4b30bd87..96d2c621 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -79,81 +79,51 @@ api_parse_session_handle (u64 handle, u32 * session_index, u32 * thread_index) } int -vnet_bind_i (u32 api_client_index, ip46_address_t * ip46, u16 port_host_order, - session_type_t sst, u64 * options, session_cb_vft_t * cb_fns, - application_t ** app, u32 * len_seg_name, char *seg_name) +vnet_bind_i (u32 app_index, session_type_t 
sst, + transport_endpoint_t * tep, u64 * handle) { - u8 *segment_name = 0; - application_t *server = 0; + application_t *app; stream_session_t *listener; - u8 is_ip4; - - listener = - stream_session_lookup_listener (ip46, - clib_host_to_net_u16 (port_host_order), - sst); - - if (listener) - return VNET_API_ERROR_ADDRESS_IN_USE; - if (application_lookup (api_client_index)) + app = application_get_if_valid (app_index); + if (!app) { - clib_warning ("Only one connection supported for now"); - return VNET_API_ERROR_ADDRESS_IN_USE; + clib_warning ("app not attached"); + return VNET_API_ERROR_APPLICATION_NOT_ATTACHED; } - is_ip4 = SESSION_TYPE_IP4_UDP == sst || SESSION_TYPE_IP4_TCP == sst; - if (!ip_is_zero (ip46, is_ip4) && !ip_is_local (ip46, is_ip4)) - return VNET_API_ERROR_INVALID_VALUE; - - /* Allocate and initialize stream server */ - server = application_new (APP_SERVER, sst, api_client_index, - options[SESSION_OPTIONS_FLAGS], cb_fns); + listener = stream_session_lookup_listener (&tep->ip, + clib_host_to_net_u16 (tep->port), + sst); + if (listener) + return VNET_API_ERROR_ADDRESS_IN_USE; - application_server_init (server, options[SESSION_OPTIONS_SEGMENT_SIZE], - options[SESSION_OPTIONS_ADD_SEGMENT_SIZE], - options[SESSION_OPTIONS_RX_FIFO_SIZE], - options[SESSION_OPTIONS_TX_FIFO_SIZE], - &segment_name); + if (!ip_is_zero (&tep->ip, tep->is_ip4) + && !ip_is_local (&tep->ip, tep->is_ip4)) + return VNET_API_ERROR_INVALID_VALUE_2; /* Setup listen path down to transport */ - stream_session_start_listen (server->index, ip46, port_host_order); - - /* - * Return values - */ - - ASSERT (vec_len (segment_name) <= 128); - *len_seg_name = vec_len (segment_name); - memcpy (seg_name, segment_name, *len_seg_name); - *app = server; - - return 0; + return application_start_listen (app, sst, tep, handle); } int -vnet_unbind_i (u32 api_client_index) +vnet_unbind_i (u32 app_index, u64 handle) { - application_t *server; + application_t *app = application_get_if_valid (app_index); - /* 
- * Find the stream_server_t corresponding to the api client - */ - server = application_lookup (api_client_index); - if (!server) - return VNET_API_ERROR_INVALID_VALUE_2; + if (!app) + { + clib_warning ("app not attached"); + return VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } /* Clear the listener */ - stream_session_stop_listen (server->index); - application_del (server); - - return 0; + return application_stop_listen (app, handle); } int -vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, - ip46_address_t * ip46, u16 port, u64 * options, void *mp, - session_cb_vft_t * cb_fns) +vnet_connect_i (u32 app_index, u32 api_context, session_type_t sst, + transport_endpoint_t * tep, void *mp) { stream_session_t *listener; application_t *server, *app; @@ -161,8 +131,8 @@ vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, /* * Figure out if connecting to a local server */ - listener = stream_session_lookup_listener (ip46, - clib_host_to_net_u16 (port), + listener = stream_session_lookup_listener (&tep->ip, + clib_host_to_net_u16 (tep->port), sst); if (listener) { @@ -177,16 +147,11 @@ vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, redirect_connect_callback (server->api_client_index, mp); } - /* Create client app */ - app = application_new (APP_CLIENT, sst, api_client_index, - options[SESSION_OPTIONS_FLAGS], cb_fns); - - app->api_context = api_context; - /* * Not connecting to a local server. 
Create regular session */ - return stream_session_open (sst, ip46, port, app->index); + app = application_get (app_index); + return application_open_session (app, sst, tep, api_context); } /** @@ -209,30 +174,31 @@ vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, uword unformat_vnet_uri (unformat_input_t * input, va_list * args) { - ip46_address_t *address = va_arg (*args, ip46_address_t *); session_type_t *sst = va_arg (*args, session_type_t *); - u16 *port = va_arg (*args, u16 *); + transport_endpoint_t *tep = va_arg (*args, transport_endpoint_t *); - if (unformat (input, "tcp://%U/%d", unformat_ip4_address, &address->ip4, - port)) + if (unformat (input, "tcp://%U/%d", unformat_ip4_address, &tep->ip.ip4, + &tep->port)) { *sst = SESSION_TYPE_IP4_TCP; + tep->is_ip4 = 1; return 1; } - if (unformat (input, "udp://%U/%d", unformat_ip4_address, &address->ip4, - port)) + if (unformat (input, "udp://%U/%d", unformat_ip4_address, &tep->ip.ip4, + &tep->port)) { *sst = SESSION_TYPE_IP4_UDP; + tep->is_ip4 = 1; return 1; } - if (unformat (input, "udp://%U/%d", unformat_ip6_address, &address->ip6, - port)) + if (unformat (input, "udp://%U/%d", unformat_ip6_address, &tep->ip.ip6, + &tep->port)) { *sst = SESSION_TYPE_IP6_UDP; return 1; } - if (unformat (input, "tcp://%U/%d", unformat_ip6_address, &address->ip6, - port)) + if (unformat (input, "tcp://%U/%d", unformat_ip6_address, &tep->ip.ip6, + &tep->port)) { *sst = SESSION_TYPE_IP6_TCP; return 1; @@ -242,8 +208,7 @@ unformat_vnet_uri (unformat_input_t * input, va_list * args) } int -parse_uri (char *uri, session_type_t * sst, ip46_address_t * addr, - u16 * port_number_host_byte_order) +parse_uri (char *uri, session_type_t * sst, transport_endpoint_t * tep) { unformat_input_t _input, *input = &_input; @@ -252,8 +217,7 @@ parse_uri (char *uri, session_type_t * sst, ip46_address_t * addr, /* Parse uri */ unformat_init_string (input, uri, strlen (uri)); - if (!unformat (input, "%U", unformat_vnet_uri, 
addr, sst, - port_number_host_byte_order)) + if (!unformat (input, "%U", unformat_vnet_uri, sst, tep)) { unformat_free (input); return VNET_API_ERROR_INVALID_VALUE; @@ -263,26 +227,51 @@ parse_uri (char *uri, session_type_t * sst, ip46_address_t * addr, return 0; } +/** + * Attaches application. + * + * Allocates a vpp app, i.e., a structure that keeps back pointers + * to external app and a segment manager for shared memory fifo based + * communication with the external app. + */ int -vnet_bind_uri (vnet_bind_args_t * a) +vnet_application_attach (vnet_app_attach_args_t * a) { - application_t *server = 0; - u16 port_host_order; - session_type_t sst = SESSION_N_TYPES; - ip46_address_t ip46; + application_t *app = 0; + segment_manager_t *sm; + u8 *seg_name; int rv; - memset (&ip46, 0, sizeof (ip46)); - rv = parse_uri (a->uri, &sst, &ip46, &port_host_order); - if (rv) + app = application_new (); + if ((rv = application_init (app, a->api_client_index, a->options, + a->session_cb_vft))) return rv; - if ((rv = vnet_bind_i (a->api_client_index, &ip46, port_host_order, sst, - a->options, a->session_cb_vft, &server, - &a->segment_name_length, a->segment_name))) - return rv; + a->app_event_queue_address = (u64) app->event_queue; + sm = segment_manager_get (app->first_segment_manager); + segment_manager_get_segment_info (sm->segment_indices[0], + &seg_name, &a->segment_size); - a->server_event_queue_address = (u64) server->event_queue; + a->segment_name_length = vec_len (seg_name); + a->segment_name = seg_name; + ASSERT (vec_len (a->segment_name) <= 128); + a->app_index = app->index; + return 0; +} + +int +vnet_application_detach (vnet_app_detach_args_t * a) +{ + application_t *app; + app = application_get_if_valid (a->app_index); + + if (!app) + { + clib_warning ("app not attached"); + return VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } + + application_del (app); return 0; } @@ -308,125 +297,102 @@ session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) } int 
-vnet_unbind_uri (char *uri, u32 api_client_index) +vnet_bind_uri (vnet_bind_args_t * a) { - u16 port_number_host_byte_order; session_type_t sst = SESSION_N_TYPES; - ip46_address_t ip46_address; - stream_session_t *listener; + transport_endpoint_t tep; int rv; - rv = parse_uri (uri, &sst, &ip46_address, &port_number_host_byte_order); + memset (&tep, 0, sizeof (tep)); + rv = parse_uri (a->uri, &sst, &tep); if (rv) return rv; - listener = - stream_session_lookup_listener (&ip46_address, - clib_host_to_net_u16 - (port_number_host_byte_order), sst); + if ((rv = vnet_bind_i (a->app_index, sst, &tep, &a->handle))) + return rv; + + return 0; +} + +int +vnet_unbind_uri (vnet_unbind_args_t * a) +{ + session_type_t sst = SESSION_N_TYPES; + stream_session_t *listener; + transport_endpoint_t tep; + int rv; + + rv = parse_uri (a->uri, &sst, &tep); + if (rv) + return rv; + listener = stream_session_lookup_listener (&tep.ip, + clib_host_to_net_u16 (tep.port), + sst); if (!listener) return VNET_API_ERROR_ADDRESS_NOT_IN_USE; - /* External client? 
*/ - if (api_client_index != ~0) - { - ASSERT (vl_api_client_index_to_registration (api_client_index)); - } - - return vnet_unbind_i (api_client_index); + return vnet_unbind_i (a->app_index, listen_session_get_handle (listener)); } int vnet_connect_uri (vnet_connect_args_t * a) { - ip46_address_t ip46_address; - u16 port; + transport_endpoint_t tep; session_type_t sst; - application_t *app; int rv; - app = application_lookup (a->api_client_index); - if (app) - { - clib_warning ("Already have a connect from this app"); - return VNET_API_ERROR_INVALID_VALUE_2; - } - /* Parse uri */ - rv = parse_uri (a->uri, &sst, &ip46_address, &port); + memset (&tep, 0, sizeof (tep)); + rv = parse_uri (a->uri, &sst, &tep); if (rv) return rv; - return vnet_connect_i (a->api_client_index, a->api_context, sst, - &ip46_address, port, a->options, a->mp, - a->session_cb_vft); + return vnet_connect_i (a->app_index, a->api_context, sst, &tep, a->mp); } int -vnet_disconnect_session (u32 session_index, u32 thread_index) +vnet_disconnect_session (vnet_disconnect_args_t * a) { - stream_session_t *session; + u32 index, thread_index; + stream_session_t *s; - session = stream_session_get (session_index, thread_index); - stream_session_disconnect (session); + stream_session_parse_handle (a->handle, &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + + if (!s || s->app_index != a->app_index) + return VNET_API_ERROR_INVALID_VALUE; + stream_session_disconnect (s); return 0; } - int vnet_bind (vnet_bind_args_t * a) { - application_t *server = 0; session_type_t sst = SESSION_N_TYPES; int rv; sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); - if ((rv = vnet_bind_i (a->api_client_index, &a->tep.ip, a->tep.port, sst, - a->options, a->session_cb_vft, &server, - &a->segment_name_length, a->segment_name))) + if ((rv = vnet_bind_i (a->app_index, sst, &a->tep, &a->handle))) return rv; - a->server_event_queue_address = (u64) server->event_queue; - a->handle = (u64) 
a->tep.vrf << 32 | (u64) server->session_index; return 0; } int vnet_unbind (vnet_unbind_args_t * a) { - application_t *server; - - if (a->api_client_index != ~0) - { - ASSERT (vl_api_client_index_to_registration (a->api_client_index)); - } - - /* Make sure this is the right one */ - server = application_lookup (a->api_client_index); - ASSERT (server->session_index == (0xFFFFFFFF & a->handle)); - - /* TODO use handle to disambiguate namespaces/vrfs */ - return vnet_unbind_i (a->api_client_index); + return vnet_unbind_i (a->app_index, a->handle); } int vnet_connect (vnet_connect_args_t * a) { session_type_t sst; - application_t *app; - - app = application_lookup (a->api_client_index); - if (app) - { - clib_warning ("Already have a connect from this app"); - return VNET_API_ERROR_INVALID_VALUE_2; - } sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); - return vnet_connect_i (a->api_client_index, a->api_context, sst, &a->tep.ip, - a->tep.port, a->options, a->mp, a->session_cb_vft); + return vnet_connect_i (a->app_index, a->api_context, sst, &a->tep, a->mp); } int diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index a5f2b9a6..2c497531 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -28,6 +28,27 @@ typedef enum _session_api_proto SESSION_PROTO_UDP } session_api_proto_t; +typedef struct _vnet_app_attach_args_t +{ + u32 api_client_index; + u64 *options; + session_cb_vft_t *session_cb_vft; + + /* + * Results + */ + u8 *segment_name; + u32 segment_name_length; + u32 segment_size; + u64 app_event_queue_address; + u32 app_index; +} vnet_app_attach_args_t; + +typedef struct _vnet_app_detach_args_t +{ + u32 app_index; +} vnet_app_detach_args_t; + typedef struct _vnet_bind_args_t { union @@ -40,9 +61,7 @@ typedef struct _vnet_bind_args_t }; }; - u32 api_client_index; - u64 *options; - session_cb_vft_t *session_cb_vft; + u32 app_index; /* * Results @@ -60,7 +79,7 
@@ typedef struct _vnet_unbind_args_t char *uri; u64 handle; }; - u32 api_client_index; + u32 app_index; } vnet_unbind_args_t; typedef struct _vnet_connect_args @@ -74,10 +93,8 @@ typedef struct _vnet_connect_args session_api_proto_t proto; }; }; - u32 api_client_index; + u32 app_index; u32 api_context; - u64 *options; - session_cb_vft_t *session_cb_vft; /* Used for redirects */ void *mp; @@ -86,12 +103,13 @@ typedef struct _vnet_connect_args typedef struct _vnet_disconnect_args_t { u64 handle; - u32 api_client_index; + u32 app_index; } vnet_disconnect_args_t; -/* Bind / connect options */ +/* Application attach options */ typedef enum { + APP_EVT_QUEUE_SIZE, SESSION_OPTIONS_FLAGS, SESSION_OPTIONS_SEGMENT_SIZE, SESSION_OPTIONS_ADD_SEGMENT_SIZE, @@ -99,7 +117,7 @@ typedef enum SESSION_OPTIONS_TX_FIFO_SIZE, SESSION_OPTIONS_ACCEPT_COOKIE, SESSION_OPTIONS_N_OPTIONS -} session_options_index_t; +} app_attach_options_index_t; /** Server can handle delegated connect requests from local clients */ #define SESSION_OPTIONS_FLAGS_USE_FIFO (1<<0) @@ -109,10 +127,13 @@ typedef enum #define VNET_CONNECT_REDIRECTED 123 +int vnet_application_attach (vnet_app_attach_args_t * a); +int vnet_application_detach (vnet_app_detach_args_t * a); + int vnet_bind_uri (vnet_bind_args_t *); -int vnet_unbind_uri (char *uri, u32 api_client_index); +int vnet_unbind_uri (vnet_unbind_args_t * a); int vnet_connect_uri (vnet_connect_args_t * a); -int vnet_disconnect_session (u32 session_index, u32 thread_index); +int vnet_disconnect_session (vnet_disconnect_args_t * a); int vnet_bind (vnet_bind_args_t * a); int vnet_connect (vnet_connect_args_t * a); diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c new file mode 100644 index 00000000..16e5bc56 --- /dev/null +++ b/src/vnet/session/segment_manager.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +/** + * Counter used to build segment names + */ +u32 segment_name_counter = 0; + +/** + * Pool of segment managers + */ +segment_manager_t *segment_managers = 0; + +/** + * Default fifo and segment size. TODO config. + */ +u32 default_fifo_size = 1 << 16; +u32 default_segment_size = 1 << 20; + +void +segment_manager_get_segment_info (u32 index, u8 ** name, u32 * size) +{ + svm_fifo_segment_private_t *s; + s = svm_fifo_get_segment (index); + *name = s->h->segment_name; + *size = s->ssvm.ssvm_size; +} + +always_inline int +session_manager_add_segment_i (segment_manager_t * sm, u32 segment_size, + u8 * segment_name) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + int rv; + + memset (ca, 0, sizeof (*ca)); + + ca->segment_name = (char *) segment_name; + ca->segment_size = segment_size; + + rv = svm_fifo_segment_create (ca); + if (rv) + { + clib_warning ("svm_fifo_segment_create ('%s', %d) failed", + ca->segment_name, ca->segment_size); + vec_free (segment_name); + return VNET_API_ERROR_SVM_SEGMENT_CREATE_FAIL; + } + + vec_add1 (sm->segment_indices, ca->new_segment_index); + + return 0; +} + +int +session_manager_add_segment (segment_manager_t * sm) +{ + u8 *segment_name; + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + u32 add_segment_size; + int rv; + + memset (ca, 0, sizeof (*ca)); + segment_name = format (0, "%d-%d%c", getpid (), segment_name_counter++, 0); + 
add_segment_size = sm->properties->add_segment_size ? + sm->properties->add_segment_size : default_segment_size; + + rv = session_manager_add_segment_i (sm, add_segment_size, segment_name); + vec_free (segment_name); + return rv; +} + +int +session_manager_add_first_segment (segment_manager_t * sm, u32 segment_size) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + u8 *segment_name; + int rv; + + memset (ca, 0, sizeof (*ca)); + segment_name = format (0, "%d-%d%c", getpid (), segment_name_counter++, 0); + rv = session_manager_add_segment_i (sm, segment_size, segment_name); + vec_free (segment_name); + return rv; +} + +/** + * Initializes segment manager based on options provided. + * Returns error if svm segment allocation fails. + */ +int +segment_manager_init (segment_manager_t * sm, + segment_manager_properties_t * properties, + u32 first_seg_size) +{ + int rv; + + /* app allocates these */ + sm->properties = properties; + + if (first_seg_size > 0) + { + rv = session_manager_add_first_segment (sm, first_seg_size); + if (rv) + { + clib_warning ("Failed to allocate segment"); + return rv; + } + } + + return 0; +} + +/** + * Removes segment manager. + * + * Since the fifos allocated in the segment keep backpointers to the sessions + * prior to removing the segment, we call session disconnect. This + * subsequently propagates into transport. 
+ */ +void +segment_manager_del (segment_manager_t * sm) +{ + u32 *deleted_sessions = 0; + u32 *deleted_thread_indices = 0; + int i, j; + + /* Across all fifo segments used by the server */ + for (j = 0; j < vec_len (sm->segment_indices); j++) + { + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + /* Vector of fifos allocated in the segment */ + fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]); + fifos = svm_fifo_segment_get_fifos (fifo_segment); + + /* + * Remove any residual sessions from the session lookup table + * Don't bother deleting the individual fifos, we're going to + * throw away the fifo segment in a minute. + */ + for (i = 0; i < vec_len (fifos); i++) + { + svm_fifo_t *fifo; + u32 session_index, thread_index; + stream_session_t *session; + + fifo = fifos[i]; + session_index = fifo->server_session_index; + thread_index = fifo->server_thread_index; + + session = stream_session_get (session_index, thread_index); + + /* Add to the deleted_sessions vector (once!) 
*/ + if (!session->is_deleted) + { + session->is_deleted = 1; + vec_add1 (deleted_sessions, session_index); + vec_add1 (deleted_thread_indices, thread_index); + } + } + + for (i = 0; i < vec_len (deleted_sessions); i++) + { + stream_session_t *session; + session = stream_session_get (deleted_sessions[i], + deleted_thread_indices[i]); + + /* Instead of directly removing the session call disconnect */ + stream_session_disconnect (session); + + /* + stream_session_table_del (smm, session); + pool_put(smm->sessions[deleted_thread_indices[i]], session); + */ + } + + vec_reset_length (deleted_sessions); + vec_reset_length (deleted_thread_indices); + + /* Instead of removing the segment, test when removing the session if + * the segment can be removed + */ + /* svm_fifo_segment_delete (fifo_segment); */ + } + + vec_free (deleted_sessions); + vec_free (deleted_thread_indices); + pool_put (segment_managers, sm); +} + +static int +segment_manager_notify_app_seg_add (segment_manager_t * sm, + u32 fifo_segment_index) +{ + application_t *app = application_get (sm->app_index); + u32 seg_size = 0; + u8 *seg_name; + + /* Send an API message to the external app, to map new segment */ + ASSERT (app->cb_fns.add_segment_callback); + + segment_manager_get_segment_info (fifo_segment_index, &seg_name, &seg_size); + return app->cb_fns.add_segment_callback (app->api_client_index, seg_name, + seg_size); +} + +int +segment_manager_alloc_session_fifos (segment_manager_t * sm, + svm_fifo_t ** server_rx_fifo, + svm_fifo_t ** server_tx_fifo, + u32 * fifo_segment_index) +{ + svm_fifo_segment_private_t *fifo_segment; + u32 fifo_size, sm_index; + u8 added_a_segment = 0; + int i; + + /* Allocate svm fifos */ + ASSERT (vec_len (sm->segment_indices)); + +again: + for (i = 0; i < vec_len (sm->segment_indices); i++) + { + *fifo_segment_index = sm->segment_indices[i]; + fifo_segment = svm_fifo_get_segment (*fifo_segment_index); + + fifo_size = sm->properties->rx_fifo_size; + fifo_size = (fifo_size == 0) 
? default_fifo_size : fifo_size; + *server_rx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + fifo_size = sm->properties->tx_fifo_size; + fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; + *server_tx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + if (*server_rx_fifo == 0) + { + /* This would be very odd, but handle it... */ + if (*server_tx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo); + *server_tx_fifo = 0; + } + continue; + } + if (*server_tx_fifo == 0) + { + if (*server_rx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo); + *server_rx_fifo = 0; + } + continue; + } + break; + } + + /* See if we're supposed to create another segment */ + if (*server_rx_fifo == 0) + { + if (sm->properties->add_segment) + { + if (added_a_segment) + { + clib_warning ("added a segment, still cant allocate a fifo"); + return SESSION_ERROR_NEW_SEG_NO_SPACE; + } + + if (session_manager_add_segment (sm)) + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + + added_a_segment = 1; + goto again; + } + else + { + clib_warning ("No space to allocate fifos!"); + return SESSION_ERROR_NO_SPACE; + } + } + + if (added_a_segment) + return segment_manager_notify_app_seg_add (sm, *fifo_segment_index); + + /* Backpointers to segment manager */ + sm_index = segment_manager_index (sm); + (*server_tx_fifo)->segment_manager = sm_index; + (*server_rx_fifo)->segment_manager = sm_index; + + return 0; +} + +void +segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, + svm_fifo_t * tx_fifo) +{ + segment_manager_t *sm; + svm_fifo_segment_private_t *fifo_segment; + + fifo_segment = svm_fifo_get_segment (svm_segment_index); + svm_fifo_segment_free_fifo (fifo_segment, rx_fifo); + svm_fifo_segment_free_fifo (fifo_segment, tx_fifo); + + /* If we have segment manager, try doing some cleanup. 
+ * It's possible to have no segment manager if the session was removed + * as result of a detach */ + sm = segment_manager_get_if_valid (rx_fifo->segment_manager); + if (sm) + { + /* Remove segment only if it holds no fifos and not the first */ + if (sm->segment_indices[0] != svm_segment_index + && !svm_fifo_segment_has_fifos (fifo_segment)) + { + svm_fifo_segment_delete (fifo_segment); + vec_del1 (sm->segment_indices, svm_segment_index); + } + } +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h new file mode 100644 index 00000000..778d6040 --- /dev/null +++ b/src/vnet/session/segment_manager.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_VNET_SESSION_SEGMENT_MANAGER_H_ +#define SRC_VNET_SESSION_SEGMENT_MANAGER_H_ + +#include +#include + +typedef struct _segment_manager_properties +{ + /** Session fifo sizes. 
*/ + u32 rx_fifo_size; + u32 tx_fifo_size; + + /** Configured additional segment size */ + u32 add_segment_size; + + /** Flag that indicates if additional segments should be created */ + u8 add_segment; + +} segment_manager_properties_t; + +typedef struct _segment_manager +{ + /** segments mapped by this manager */ + u32 *segment_indices; + + /** Owner app index */ + u32 app_index; + + /** Pointer to manager properties. Could be shared among all of + * an app's segment managers */ + segment_manager_properties_t *properties; +} segment_manager_t; + +/** Pool of segment managers */ +extern segment_manager_t *segment_managers; + +always_inline segment_manager_t * +segment_manager_new () +{ + segment_manager_t *sm; + pool_get (segment_managers, sm); + memset (sm, 0, sizeof (*sm)); + return sm; +} + +always_inline segment_manager_t * +segment_manager_get (u32 index) +{ + return pool_elt_at_index (segment_managers, index); +} + +always_inline segment_manager_t * +segment_manager_get_if_valid (u32 index) +{ + if (pool_is_free_index (segment_managers, index)) + return 0; + return pool_elt_at_index (segment_managers, index); +} + +always_inline u32 +segment_manager_index (segment_manager_t * sm) +{ + return sm - segment_managers; +} + +int +segment_manager_init (segment_manager_t * sm, + segment_manager_properties_t * properties, + u32 seg_size); + +void segment_manager_get_segment_info (u32 index, u8 ** name, u32 * size); +int +session_manager_add_first_segment (segment_manager_t * sm, u32 segment_size); +int session_manager_add_segment (segment_manager_t * sm); +void segment_manager_del (segment_manager_t * sm); +int +segment_manager_alloc_session_fifos (segment_manager_t * sm, + svm_fifo_t ** server_rx_fifo, + svm_fifo_t ** server_tx_fifo, + u32 * fifo_segment_index); +void +segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, + svm_fifo_t * tx_fifo); + +#endif /* SRC_VNET_SESSION_SEGMENT_MANAGER_H_ */ +/* + * fd.io 
coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session.api b/src/vnet/session/session.api index 582765b5..e207e46f 100644 --- a/src/vnet/session/session.api +++ b/src/vnet/session/session.api @@ -13,6 +13,68 @@ * limitations under the License. */ +/** \brief client->vpp, attach application to session layer + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param initial_segment_size - size of the initial shm segment to be + allocated + @param options - segment size, fifo sizes, etc. +*/ + define application_attach { + u32 client_index; + u32 context; + u32 initial_segment_size; + u64 options[16]; + }; + + /** \brief Application attach reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param app_event_queue_address - vpp event queue address or 0 if this + connection shouldn't send events + @param segment_size - size of first shm segment + @param segment_name_length - length of segment name + @param segment_name - name of segment client needs to attach to +*/ +define application_attach_reply { + u32 context; + i32 retval; + u64 app_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + + /** \brief client->vpp, detach application from session layer + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ + define application_detach { + u32 client_index; + u32 context; + }; + + /** \brief detach reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define application_detach_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp->client, please map an additional shared memory segment + @param client_index - opaque cookie to identify the sender + @param context - sender 
context, to match reply w/ request + @param segment_name - +*/ +define map_another_segment { + u32 client_index; + u32 context; + u32 segment_size; + u8 segment_name[128]; +}; + /** \brief Bind to a given URI @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request @@ -25,9 +87,7 @@ define bind_uri { u32 client_index; u32 context; u32 accept_cookie; - u32 initial_segment_size; u8 uri[128]; - u64 options[16]; }; /** \brief Unbind a given URI @@ -49,7 +109,10 @@ define unbind_uri { @param accept_cookie - sender accept cookie, to identify this bind flavor @param uri - a URI, e.g. "tcp4://0.0.0.0/0/80" "tcp6://::/0/80" [ipv6], etc. - @param options - socket options, fifo sizes, etc. + @param options - socket options, fifo sizes, etc. passed by vpp to the + server when redirecting connects + @param client_queue_address - binary API client queue address. Used by + local server when connect was redirected. */ define connect_uri { u32 client_index; @@ -62,18 +125,10 @@ define connect_uri { /** \brief Bind reply @param context - sender context, to match reply w/ request @param retval - return code for the request - @param event_queue_address - vpp event queue address or 0 if this - connection shouldn't send events - @param segment_name_length - length of segment name - @param segment_name - name of segment client needs to attach to */ define bind_uri_reply { u32 context; i32 retval; - u64 server_event_queue_address; - u8 segment_name_length; - u32 segment_size; - u8 segment_name[128]; }; /** \brief unbind reply @@ -88,43 +143,28 @@ define unbind_uri_reply { /** \brief vpp->client, connect reply @param context - sender context, to match reply w/ request @param retval - return code for the request + @param handle - session handle @param server_rx_fifo - rx (vpp -> vpp-client) fifo address @param server_tx_fifo - tx (vpp-client -> vpp) fifo address - @param session_index - session index; - @param session_thread_index 
- session thread index - @param session_type - session thread type @param vpp_event_queue_address - vpp's event queue address - @param client_event_queue_address - client's event queue address + @param segment_size - size of segment to be attached. Only for redirects. @param segment_name_length - non-zero if the client needs to attach to - the fifo segment + the fifo segment. This should only happen + if session was redirected. @param segment_name - set if the client needs to attach to the segment */ define connect_uri_reply { u32 context; i32 retval; + u64 handle; u64 server_rx_fifo; u64 server_tx_fifo; - u32 session_index; - u32 session_thread_index; - u8 session_type; - u64 client_event_queue_address; u64 vpp_event_queue_address; u32 segment_size; u8 segment_name_length; u8 segment_name[128]; }; -/** \brief vpp->client, please map an additional shared memory segment - @param context - sender context, to match reply w/ request - @param segment_name - -*/ -define map_another_segment { - u32 client_index; - u32 context; - u32 segment_size; - u8 segment_name[128]; -}; - /** \brief client->vpp @param context - sender context, to match reply w/ request @param retval - return code for the request @@ -136,25 +176,27 @@ define map_another_segment_reply { /** \brief vpp->client, accept this session @param context - sender context, to match reply w/ request - @param accept_cookie - tells client which bind flavor just occurred + @param listener_handle - tells client which listener this pertains to + @param handle - unique session identifier + @param session_thread_index - thread index of new session @param rx_fifo_address - rx (vpp -> vpp-client) fifo address @param tx_fifo_address - tx (vpp-client -> vpp) fifo address - @param session_index - index of new session - @param session_thread_index - thread index of new session @param vpp_event_queue_address - vpp's event queue address - @param session_type - type of session - + @param port - remote port + @param is_ip4 - 1 if 
the ip is ip4 + @param ip - remote ip */ define accept_session { u32 client_index; u32 context; - u32 accept_cookie; + u64 listener_handle; + u64 handle; u64 server_rx_fifo; u64 server_tx_fifo; - u32 session_index; - u32 session_thread_index; u64 vpp_event_queue_address; - u8 session_type; + u16 port; + u8 is_ip4; + u8 ip[16]; }; /** \brief client->vpp, reply to an accept message @@ -167,23 +209,19 @@ define accept_session { define accept_session_reply { u32 context; i32 retval; - u8 session_type; - u8 session_thread_index; - u32 session_index; + u64 handle; }; /** \brief bidirectional disconnect API @param client_index - opaque cookie to identify the sender client to vpp direction only @param context - sender context, to match reply w/ request - @param session_index - cookie #1 from accept_session / connect_reply - @param session_thread_index - cookie #2 + @param handle - session handle obtained from accept/connect */ define disconnect_session { u32 client_index; u32 context; - u32 session_index; - u32 session_thread_index; + u64 handle; }; /** \brief bidirectional disconnect reply API @@ -191,31 +229,25 @@ define disconnect_session { client to vpp direction only @param context - sender context, to match reply w/ request @param retval - return code for the request - @param session_index - session index from accept_session / connect_reply - @param session_thread_index - thread index from accept_session / - connect_reply + @param handle - session handle */ define disconnect_session_reply { u32 client_index; u32 context; i32 retval; - u32 session_index; - u32 session_thread_index; + u64 handle; }; /** \brief vpp->client reset session API @param client_index - opaque cookie to identify the sender client to vpp direction only @param context - sender context, to match reply w/ request - @param session_index - session index from accept_session / connect_reply - @param session_thread_index - thread index from accept_session / - connect_reply + @param handle - session 
handle obtained via accept/connects */ define reset_session { u32 client_index; u32 context; - u32 session_index; - u32 session_thread_index; + u64 handle; }; /** \brief client->vpp reset session reply @@ -223,16 +255,13 @@ define reset_session { client to vpp direction only @param context - sender context, to match reply w/ request @param retval - return code for the request - @param session_index - session index from accept_session / connect_reply - @param session_thread_index - thread index from accept_session / - connect_reply + @param handle - session handle obtained via accept/connect */ define reset_session_reply { u32 client_index; u32 context; i32 retval; - u32 session_index; - u32 session_thread_index; + u64 handle; }; /** \brief Bind to an ip:port pair for a given transport protocol @@ -277,7 +306,7 @@ define unbind_sock { @param proto - protocol 0 - TCP 1 - UDP @param client_queue_address - client's API queue address. Non-zero when used to perform redirects - @param options - socket options, fifo sizes, etc. + @param options - socket options, fifo sizes, etc. when doing redirects */ define connect_sock { u32 client_index; @@ -326,7 +355,7 @@ define unbind_sock_reply { @param server_rx_fifo - rx (vpp -> vpp-client) fifo address @param server_tx_fifo - tx (vpp-client -> vpp) fifo address @param vpp_event_queue_address - vpp's event queue address - @param client_event_queue_address - client's event queue address + @param segment_size - size of segment to be attached. Only for redirects. 
@param segment_name_length - non-zero if the client needs to attach to the fifo segment @param segment_name - set if the client needs to attach to the segment @@ -337,92 +366,12 @@ define connect_sock_reply { u64 handle; u64 server_rx_fifo; u64 server_tx_fifo; - u64 client_event_queue_address; u64 vpp_event_queue_address; u32 segment_size; u8 segment_name_length; u8 segment_name[128]; }; -/** \brief bidirectional disconnect API - @param client_index - opaque cookie to identify the sender - client to vpp direction only - @param context - sender context, to match reply w/ request - @param handle - session handle obtained through accept/connect -*/ -define disconnect_sock { - u32 client_index; - u32 context; - u64 handle; -}; - -/** \brief bidirectional disconnect reply API - @param client_index - opaque cookie to identify the sender - client to vpp direction only - @param client_context - sender context, to match reply w/ request - @param handle - session handle obtained through accept/connect -*/ -define disconnect_sock_reply { - u32 client_index; - u32 context; - i32 retval; - u64 handle; -}; - -/** \brief vpp->client, accept this session - @param context - sender context, to match reply w/ request - @param accept_cookie - tells client which bind flavor just occurred - @param handle - session handle obtained through accept/connect - @param rx_fifo_address - rx (vpp -> vpp-client) fifo address - @param tx_fifo_address - tx (vpp-client -> vpp) fifo address - @param vpp_event_queue_address - vpp's event queue address -*/ -define accept_sock { - u32 client_index; - u32 context; - u32 accept_cookie; - u64 handle; - u64 server_rx_fifo; - u64 server_tx_fifo; - u64 vpp_event_queue_address; -}; - -/** \brief client->vpp, reply to an accept message - @param context - sender context, to match reply w/ request - @param retval - return code for the request - @param handle - session handle obtained through accept/connect -*/ -define accept_sock_reply { - u32 context; - i32 
retval; - u64 handle; -}; - -/** \brief vpp->client reset session API - @param client_index - opaque cookie to identify the sender - client to vpp direction only - @param context - sender context, to match reply w/ request - @param handle - session handle obtained through accept/connect -*/ -define reset_sock { - u32 client_index; - u32 context; - u64 handle; -}; - -/** \brief client->vpp reset session reply - @param client_index - opaque cookie to identify the sender - client to vpp direction only - @param context - sender context, to match reply w/ request - @param handle - session handle obtained through accept/connect -*/ -define reset_sock_reply { - u32 client_index; - u32 context; - i32 retval; - u64 handle; -}; - /** \brief enable/disable session layer @param client_index - opaque cookie to identify the sender client to vpp direction only diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 8e2b2616..e6cfe7da 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -36,15 +36,14 @@ session_manager_main_t session_manager_main; * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) * Value: (owner thread index << 32 | session_index); */ -static void -stream_session_table_add_for_tc (u8 sst, transport_connection_t * tc, - u64 value) +void +stream_session_table_add_for_tc (transport_connection_t * tc, u64 value) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; session_kv6_t kv6; - switch (sst) + switch (tc->proto) { case SESSION_TYPE_IP4_UDP: case SESSION_TYPE_IP4_TCP: @@ -72,12 +71,12 @@ stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, tc = tp_vfts[s->session_type].get_connection (s->connection_index, s->thread_index); - stream_session_table_add_for_tc (s->session_type, tc, value); + stream_session_table_add_for_tc (tc, value); } static void -stream_session_half_open_table_add (u8 sst, transport_connection_t * tc, - u64 value) 
+stream_session_half_open_table_add (session_type_t sst, + transport_connection_t * tc, u64 value) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; @@ -105,14 +104,13 @@ stream_session_half_open_table_add (u8 sst, transport_connection_t * tc, } } -static int -stream_session_table_del_for_tc (session_manager_main_t * smm, u8 sst, - transport_connection_t * tc) +int +stream_session_table_del_for_tc (transport_connection_t * tc) { + session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; session_kv6_t kv6; - - switch (sst) + switch (tc->proto) { case SESSION_TYPE_IP4_UDP: case SESSION_TYPE_IP4_TCP: @@ -141,7 +139,7 @@ stream_session_table_del (session_manager_main_t * smm, stream_session_t * s) ts = tp_vfts[s->session_type].get_connection (s->connection_index, s->thread_index); - return stream_session_table_del_for_tc (smm, s->session_type, ts); + return stream_session_table_del_for_tc (ts); } static void @@ -383,7 +381,7 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, * Allocate vpp event queue (once) per worker thread */ void -vpp_session_event_queue_allocate (session_manager_main_t * smm, +session_vpp_event_queue_allocate (session_manager_main_t * smm, u32 thread_index) { api_main_t *am = &api_main; @@ -406,266 +404,24 @@ vpp_session_event_queue_allocate (session_manager_main_t * smm, } } -void -session_manager_get_segment_info (u32 index, u8 ** name, u32 * size) -{ - svm_fifo_segment_private_t *s; - s = svm_fifo_get_segment (index); - *name = s->h->segment_name; - *size = s->ssvm.ssvm_size; -} - -always_inline int -session_manager_add_segment_i (session_manager_main_t * smm, - session_manager_t * sm, - u32 segment_size, u8 * segment_name) -{ - svm_fifo_segment_create_args_t _ca, *ca = &_ca; - int rv; - - memset (ca, 0, sizeof (*ca)); - - ca->segment_name = (char *) segment_name; - ca->segment_size = segment_size; - - rv = svm_fifo_segment_create (ca); - if (rv) - { - clib_warning 
("svm_fifo_segment_create ('%s', %d) failed", - ca->segment_name, ca->segment_size); - vec_free (segment_name); - return -1; - } - - vec_add1 (sm->segment_indices, ca->new_segment_index); - - return 0; -} - -static int -session_manager_add_segment (session_manager_main_t * smm, - session_manager_t * sm) -{ - u8 *segment_name; - svm_fifo_segment_create_args_t _ca, *ca = &_ca; - u32 add_segment_size; - u32 default_segment_size = 128 << 10; - - memset (ca, 0, sizeof (*ca)); - segment_name = format (0, "%d-%d%c", getpid (), - smm->unique_segment_name_counter++, 0); - add_segment_size = - sm->add_segment_size ? sm->add_segment_size : default_segment_size; - - return session_manager_add_segment_i (smm, sm, add_segment_size, - segment_name); -} - -int -session_manager_add_first_segment (session_manager_main_t * smm, - session_manager_t * sm, u32 segment_size, - u8 ** segment_name) -{ - svm_fifo_segment_create_args_t _ca, *ca = &_ca; - memset (ca, 0, sizeof (*ca)); - *segment_name = format (0, "%d-%d%c", getpid (), - smm->unique_segment_name_counter++, 0); - return session_manager_add_segment_i (smm, sm, segment_size, *segment_name); -} - -void -session_manager_del (session_manager_main_t * smm, session_manager_t * sm) -{ - u32 *deleted_sessions = 0; - u32 *deleted_thread_indices = 0; - int i, j; - - /* Across all fifo segments used by the server */ - for (j = 0; j < vec_len (sm->segment_indices); j++) - { - svm_fifo_segment_private_t *fifo_segment; - svm_fifo_t **fifos; - /* Vector of fifos allocated in the segment */ - fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]); - fifos = (svm_fifo_t **) fifo_segment->h->fifos; - - /* - * Remove any residual sessions from the session lookup table - * Don't bother deleting the individual fifos, we're going to - * throw away the fifo segment in a minute. 
- */ - for (i = 0; i < vec_len (fifos); i++) - { - svm_fifo_t *fifo; - u32 session_index, thread_index; - stream_session_t *session; - - fifo = fifos[i]; - session_index = fifo->server_session_index; - thread_index = fifo->server_thread_index; - - session = pool_elt_at_index (smm->sessions[thread_index], - session_index); - - /* Add to the deleted_sessions vector (once!) */ - if (!session->is_deleted) - { - session->is_deleted = 1; - vec_add1 (deleted_sessions, - session - smm->sessions[thread_index]); - vec_add1 (deleted_thread_indices, thread_index); - } - } - - for (i = 0; i < vec_len (deleted_sessions); i++) - { - stream_session_t *session; - - session = - pool_elt_at_index (smm->sessions[deleted_thread_indices[i]], - deleted_sessions[i]); - - /* Instead of directly removing the session call disconnect */ - stream_session_disconnect (session); - - /* - stream_session_table_del (smm, session); - pool_put(smm->sessions[deleted_thread_indices[i]], session); - */ - } - - vec_reset_length (deleted_sessions); - vec_reset_length (deleted_thread_indices); - - /* Instead of removing the segment, test when removing the session if - * the segment can be removed - */ - /* svm_fifo_segment_delete (fifo_segment); */ - } - - vec_free (deleted_sessions); - vec_free (deleted_thread_indices); -} - -int -session_manager_allocate_session_fifos (session_manager_main_t * smm, - session_manager_t * sm, - svm_fifo_t ** server_rx_fifo, - svm_fifo_t ** server_tx_fifo, - u32 * fifo_segment_index, - u8 * added_a_segment) -{ - svm_fifo_segment_private_t *fifo_segment; - u32 fifo_size, default_fifo_size = 1 << 16; /* TODO config */ - int i; - - *added_a_segment = 0; - - /* Allocate svm fifos */ - ASSERT (vec_len (sm->segment_indices)); - -again: - for (i = 0; i < vec_len (sm->segment_indices); i++) - { - *fifo_segment_index = sm->segment_indices[i]; - fifo_segment = svm_fifo_get_segment (*fifo_segment_index); - - fifo_size = sm->rx_fifo_size; - fifo_size = (fifo_size == 0) ? 
default_fifo_size : fifo_size; - *server_rx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); - - fifo_size = sm->tx_fifo_size; - fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; - *server_tx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); - - if (*server_rx_fifo == 0) - { - /* This would be very odd, but handle it... */ - if (*server_tx_fifo != 0) - { - svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo); - *server_tx_fifo = 0; - } - continue; - } - if (*server_tx_fifo == 0) - { - if (*server_rx_fifo != 0) - { - svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo); - *server_rx_fifo = 0; - } - continue; - } - break; - } - - /* See if we're supposed to create another segment */ - if (*server_rx_fifo == 0) - { - if (sm->add_segment) - { - if (*added_a_segment) - { - clib_warning ("added a segment, still cant allocate a fifo"); - return SESSION_ERROR_NEW_SEG_NO_SPACE; - } - - if (session_manager_add_segment (smm, sm)) - return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; - - *added_a_segment = 1; - goto again; - } - else - { - clib_warning ("No space to allocate fifos!"); - return SESSION_ERROR_NO_SPACE; - } - } - return 0; -} - int -stream_session_create_i (session_manager_main_t * smm, application_t * app, - transport_connection_t * tc, +stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, stream_session_t ** ret_s) { - int rv; + session_manager_main_t *smm = &session_manager_main; svm_fifo_t *server_rx_fifo = 0, *server_tx_fifo = 0; u32 fifo_segment_index; - u32 pool_index, seg_size; + u32 pool_index; stream_session_t *s; u64 value; u32 thread_index = tc->thread_index; - session_manager_t *sm; - u8 segment_added; - u8 *seg_name; - - sm = session_manager_get (app->session_manager_index); - - /* Check the API queue */ - if (app->mode == APP_SERVER && application_api_queue_is_full (app)) - return SESSION_ERROR_API_QUEUE_FULL; + int rv; - if ((rv = session_manager_allocate_session_fifos (smm, sm, 
&server_rx_fifo, - &server_tx_fifo, - &fifo_segment_index, - &segment_added))) + if ((rv = segment_manager_alloc_session_fifos (sm, &server_rx_fifo, + &server_tx_fifo, + &fifo_segment_index))) return rv; - if (segment_added && app->mode == APP_SERVER) - { - /* Send an API message to the external server, to map new segment */ - ASSERT (app->cb_fns.add_segment_callback); - - session_manager_get_segment_info (fifo_segment_index, &seg_name, - &seg_size); - if (app->cb_fns.add_segment_callback (app->api_client_index, seg_name, - seg_size)) - return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; - } - /* Create the session */ pool_get (smm->sessions[thread_index], s); memset (s, 0, sizeof (*s)); @@ -682,10 +438,9 @@ stream_session_create_i (session_manager_main_t * smm, application_t * app, s->server_tx_fifo = server_tx_fifo; /* Initialize state machine, such as it is... */ - s->session_type = app->session_type; + s->session_type = tc->proto; s->session_state = SESSION_STATE_CONNECTING; - s->app_index = application_get_index (app); - s->server_segment_index = fifo_segment_index; + s->svm_segment_index = fifo_segment_index; s->thread_index = thread_index; s->session_index = pool_index; @@ -697,7 +452,7 @@ stream_session_create_i (session_manager_main_t * smm, application_t * app, /* Add to the main lookup table */ value = (((u64) thread_index) << 32) | (u64) s->session_index; - stream_session_table_add_for_tc (app->session_type, tc, value); + stream_session_table_add_for_tc (tc, value); *ret_s = s; @@ -881,94 +636,6 @@ session_manager_flush_enqueue_events (u32 thread_index) return errors; } -/* - * Start listening on server's ip/port pair for requested transport. - * - * Creates a 'dummy' stream session with state LISTENING to be used in session - * lookups, prior to establishing connection. Requests transport to build - * it's own specific listening connection. 
- */ -int -stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port) -{ - session_manager_main_t *smm = &session_manager_main; - stream_session_t *s; - transport_connection_t *tc; - application_t *srv; - u32 tci; - - srv = application_get (server_index); - - pool_get (smm->listen_sessions[srv->session_type], s); - memset (s, 0, sizeof (*s)); - - s->session_type = srv->session_type; - s->session_state = SESSION_STATE_LISTENING; - s->session_index = s - smm->listen_sessions[srv->session_type]; - s->app_index = srv->index; - - /* Transport bind/listen */ - tci = tp_vfts[srv->session_type].bind (s->session_index, ip, port); - - /* Attach transport to session */ - s->connection_index = tci; - tc = tp_vfts[srv->session_type].get_listener (tci); - - srv->session_index = s->session_index; - - /* Add to the main lookup table */ - stream_session_table_add_for_tc (s->session_type, tc, s->session_index); - - return 0; -} - -void -stream_session_stop_listen (u32 server_index) -{ - session_manager_main_t *smm = &session_manager_main; - stream_session_t *listener; - transport_connection_t *tc; - application_t *srv; - - srv = application_get (server_index); - listener = pool_elt_at_index (smm->listen_sessions[srv->session_type], - srv->session_index); - - tc = tp_vfts[srv->session_type].get_listener (listener->connection_index); - stream_session_table_del_for_tc (smm, listener->session_type, tc); - - tp_vfts[srv->session_type].unbind (listener->connection_index); - pool_put (smm->listen_sessions[srv->session_type], listener); -} - -int -connect_server_add_segment_cb (application_t * ss, char *segment_name, - u32 segment_size) -{ - /* Does exactly nothing, but die */ - ASSERT (0); - return 0; -} - -void -connects_session_manager_init (session_manager_main_t * smm, u8 session_type) -{ - session_manager_t *sm; - u32 connect_fifo_size = 256 << 10; /* Config? 
*/ - u32 default_segment_size = 1 << 20; - - pool_get (smm->session_managers, sm); - memset (sm, 0, sizeof (*sm)); - - sm->add_segment_size = default_segment_size; - sm->rx_fifo_size = connect_fifo_size; - sm->tx_fifo_size = connect_fifo_size; - sm->add_segment = 1; - - session_manager_add_segment (smm, sm); - smm->connect_manager_index[session_type] = sm - smm->session_managers; -} - void stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail) @@ -976,34 +643,36 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, session_manager_main_t *smm = &session_manager_main; application_t *app; stream_session_t *new_s = 0; - u64 value; + u64 handle; + u32 api_context = 0; - value = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, - tc->lcl_port, tc->rmt_port, - tc->proto); - if (value == HALF_OPEN_LOOKUP_INVALID_VALUE) + handle = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, + tc->lcl_port, tc->rmt_port, + tc->proto); + if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); return; } - app = application_get (value >> 32); + /* Get the app's index from the handle we stored when opening connection */ + app = application_get (handle >> 32); + api_context = tc->s_index; if (!is_fail) { - /* Create new session (server segments are allocated if needed) */ - if (stream_session_create_i (smm, app, tc, &new_s)) - return; + segment_manager_t *sm; + sm = application_get_connect_segment_manager (app); - app->session_index = stream_session_get_index (new_s); - app->thread_index = new_s->thread_index; + /* Create new session (svm segments are allocated if needed) */ + if (stream_session_create_i (sm, tc, &new_s)) + return; - /* Allocate vpp event queue for this thread if needed */ - vpp_session_event_queue_allocate (smm, tc->thread_index); + new_s->app_index = app->index; } /* Notify client */ - app->cb_fns.session_connected_callback (app->api_client_index, new_s, + 
app->cb_fns.session_connected_callback (app->index, api_context, new_s, is_fail); /* Cleanup session lookup */ @@ -1046,48 +715,13 @@ void stream_session_delete (stream_session_t * s) { session_manager_main_t *smm = vnet_get_session_manager_main (); - svm_fifo_segment_private_t *fifo_segment; - application_t *app; /* Delete from the main lookup table. */ stream_session_table_del (smm, s); /* Cleanup fifo segments */ - fifo_segment = svm_fifo_get_segment (s->server_segment_index); - svm_fifo_segment_free_fifo (fifo_segment, s->server_rx_fifo); - svm_fifo_segment_free_fifo (fifo_segment, s->server_tx_fifo); - - app = application_get_if_valid (s->app_index); - - /* No app. A possibility: after disconnect application called unbind */ - if (!app) - return; - - if (app->mode == APP_CLIENT) - { - /* Cleanup app if client */ - application_del (app); - } - else if (app->mode == APP_SERVER) - { - session_manager_t *sm; - svm_fifo_segment_private_t *fifo_segment; - svm_fifo_t **fifos; - u32 fifo_index; - - /* For server, see if any segments can be removed */ - sm = session_manager_get (app->session_manager_index); - - /* Delete fifo */ - fifo_segment = svm_fifo_get_segment (s->server_segment_index); - fifos = (svm_fifo_t **) fifo_segment->h->fifos; - - fifo_index = svm_fifo_segment_index (fifo_segment); - - /* Remove segment only if it holds no fifos and not the first */ - if (sm->segment_indices[0] != fifo_index && vec_len (fifos) == 0) - svm_fifo_segment_delete (fifo_segment); - } + segment_manager_dealloc_fifos (s->svm_segment_index, s->server_rx_fifo, + s->server_tx_fifo); pool_put (smm->sessions[s->thread_index], s); } @@ -1134,21 +768,22 @@ int stream_session_accept (transport_connection_t * tc, u32 listener_index, u8 sst, u8 notify) { - session_manager_main_t *smm = &session_manager_main; application_t *server; stream_session_t *s, *listener; + segment_manager_t *sm; int rv; /* Find the server */ - listener = pool_elt_at_index (smm->listen_sessions[sst], 
listener_index); + listener = listen_session_get (sst, listener_index); server = application_get (listener->app_index); - if ((rv = stream_session_create_i (smm, server, tc, &s))) + sm = application_get_listen_segment_manager (server, listener); + if ((rv = stream_session_create_i (sm, tc, &s))) return rv; - /* Allocate vpp event queue for this thread if needed */ - vpp_session_event_queue_allocate (smm, tc->thread_index); + s->app_index = server->index; + s->listener_index = listener_index; /* Shoulder-tap the server */ if (notify) @@ -1159,37 +794,111 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, return 0; } +/** + * Ask transport to open connection to remote transport endpoint. + * + * Stores handle for matching request with reply since the call can be + * asynchronous. For instance, for TCP the 3-way handshake must complete + * before reply comes. Session is only created once connection is established. + * + * @param app_index Index of the application requesting the connect + * @param st Session type requested. + * @param tep Remote transport endpoint + * @param res Resulting transport connection . + */ int -stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, - u32 app_index) +stream_session_open (u32 app_index, session_type_t st, + transport_endpoint_t * tep, + transport_connection_t ** res) { transport_connection_t *tc; - u32 tci; - u64 value; int rv; + u64 handle; - /* Ask transport to open connection */ - rv = tp_vfts[sst].open (addr, port_host_byte_order); + rv = tp_vfts[st].open (&tep->ip, tep->port); if (rv < 0) { clib_warning ("Transport failed to open connection."); return VNET_API_ERROR_SESSION_CONNECT_FAIL; } - tci = rv; + tc = tp_vfts[st].get_half_open ((u32) rv); - /* Get transport connection */ - tc = tp_vfts[sst].get_half_open (tci); - - /* Store api_client_index and transport connection index */ - value = (((u64) app_index) << 32) | (u64) tc->c_index; + /* Save app and tc index. 
The latter is needed to help establish the + * connection while the former is needed when the connect notify comes + * and we have to notify the external app */ + handle = (((u64) app_index) << 32) | (u64) tc->c_index; /* Add to the half-open lookup table */ - stream_session_half_open_table_add (sst, tc, value); + stream_session_half_open_table_add (st, tc, handle); + + *res = tc; + + return 0; +} + +/** + * Ask transport to listen on local transport endpoint. + * + * @param s Session for which listen will be called. Note that unlike + * established sessions, listen sessions are not associated to a + * thread. + * @param tep Local endpoint to be listened on. + */ +int +stream_session_listen (stream_session_t * s, transport_endpoint_t * tep) +{ + transport_connection_t *tc; + u32 tci; + + /* Transport bind/listen */ + tci = tp_vfts[s->session_type].bind (s->session_index, &tep->ip, tep->port); + + if (tci == (u32) ~ 0) + return -1; + + /* Attach transport to session */ + s->connection_index = tci; + tc = tp_vfts[s->session_type].get_listener (tci); + + /* Weird but handle it ... */ + if (tc == 0) + return -1; + + /* Add to the main lookup table */ + stream_session_table_add_for_tc (tc, s->session_index); return 0; } +/** + * Ask transport to stop listening on local transport endpoint. + * + * @param s Session to stop listening on. It must be in state LISTENING. + */ +int +stream_session_stop_listen (stream_session_t * s) +{ + transport_connection_t *tc; + + if (s->session_state != SESSION_STATE_LISTENING) + { + clib_warning ("not a listening session"); + return -1; + } + + tc = tp_vfts[s->session_type].get_listener (s->connection_index); + if (!tc) + { + clib_warning ("no transport"); + return VNET_API_ERROR_ADDRESS_NOT_IN_USE; + } + + stream_session_table_del_for_tc (tc); + tp_vfts[s->session_type].unbind (s->connection_index); + return 0; +} + /** * Disconnect session and propagate to transport. 
This should eventually * result in a delete notification that allows us to cleanup session state. @@ -1297,6 +1006,10 @@ session_manager_main_enable (vlib_main_t * vm) vec_validate (smm->last_event_poll_by_thread, num_threads - 1); #endif + /* Allocate vpp event queues */ + for (i = 0; i < vec_len (smm->vpp_event_queues); i++) + session_vpp_event_queue_allocate (smm, i); + /* $$$$ preallocate hack config parameter */ for (i = 0; i < 200000; i++) { @@ -1322,9 +1035,6 @@ session_manager_main_enable (vlib_main_t * vm) 200000 /* $$$$ config parameter nbuckets */ , (64 << 20) /*$$$ config parameter table size */ ); - for (i = 0; i < SESSION_N_TYPES; i++) - smm->connect_manager_index[i] = INVALID_INDEX; - smm->is_enabled = 1; /* Enable TCP transport */ diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 6878b4d2..6e4ea96d 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -21,6 +21,7 @@ #include #include #include +#include #define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0) #define INVALID_INDEX ((u32)~0) @@ -107,6 +108,9 @@ typedef struct _stream_session_t svm_fifo_t *server_rx_fifo; svm_fifo_t *server_tx_fifo; + /** svm segment index where fifos were allocated */ + u32 svm_segment_index; + /** Type */ u8 session_type; @@ -133,27 +137,10 @@ typedef struct _stream_session_t /** stream server pool index */ u32 app_index; - /** svm segment index */ - u32 server_segment_index; + /** Parent listener session if the result of an accept */ + u32 listener_index; } stream_session_t; -typedef struct _session_manager -{ - /** segments mapped by this server */ - u32 *segment_indices; - - /** Session fifo sizes. 
They are provided for binds and take default - * values for connects */ - u32 rx_fifo_size; - u32 tx_fifo_size; - - /** Configured additional segment size */ - u32 add_segment_size; - - /** Flag that indicates if additional segments should be created */ - u8 add_segment; -} session_manager_t; - /* Forward definition */ typedef struct _session_manager_main session_manager_main_t; @@ -206,11 +193,6 @@ struct _session_manager_main /** Unique segment name counter */ u32 unique_segment_name_counter; - /* Connection manager used by incoming connects */ - u32 connect_manager_index[SESSION_N_TYPES]; - - session_manager_t *session_managers; - /** Per transport rx function that can either dequeue or peek */ session_fifo_rx_fn *session_tx_fns[SESSION_N_TYPES]; @@ -242,37 +224,6 @@ vnet_get_session_manager_main () return &session_manager_main; } -always_inline session_manager_t * -session_manager_get (u32 index) -{ - return pool_elt_at_index (session_manager_main.session_managers, index); -} - -always_inline unix_shared_memory_queue_t * -session_manager_get_vpp_event_queue (u32 thread_index) -{ - return session_manager_main.vpp_event_queues[thread_index]; -} - -always_inline session_manager_t * -connects_session_manager_get (session_manager_main_t * smm, - session_type_t session_type) -{ - return pool_elt_at_index (smm->session_managers, - smm->connect_manager_index[session_type]); -} - -void session_manager_get_segment_info (u32 index, u8 ** name, u32 * size); -int session_manager_flush_enqueue_events (u32 thread_index); -int -session_manager_add_first_segment (session_manager_main_t * smm, - session_manager_t * sm, u32 segment_size, - u8 ** segment_name); -void -session_manager_del (session_manager_main_t * smm, session_manager_t * sm); -void -connects_session_manager_init (session_manager_main_t * smm, u8 session_type); - /* * Stream session functions */ @@ -300,6 +251,8 @@ transport_connection_t u32 thread_index); stream_session_t *stream_session_lookup_listener 
(ip46_address_t * lcl, u16 lcl_port, u8 proto); +void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); +int stream_session_table_del_for_tc (transport_connection_t * tc); always_inline stream_session_t * stream_session_get_tsi (u64 ti_and_si, u32 thread_index) @@ -310,7 +263,7 @@ stream_session_get_tsi (u64 ti_and_si, u32 thread_index) } always_inline stream_session_t * -stream_session_get (u64 si, u32 thread_index) +stream_session_get (u32 si, u32 thread_index) { return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } @@ -327,6 +280,40 @@ stream_session_get_if_valid (u64 si, u32 thread_index) return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } +always_inline u64 +stream_session_handle (stream_session_t * s) +{ + return ((u64) s->thread_index << 32) | (u64) s->session_index; +} + +always_inline u32 +stream_session_index_from_handle (u64 handle) +{ + return handle & 0xFFFFFFFF; +} + +always_inline u32 +stream_session_thread_from_handle (u64 handle) +{ + return handle >> 32; +} + +always_inline void +stream_session_parse_handle (u64 handle, u32 * index, u32 * thread_index) +{ + *index = stream_session_index_from_handle (handle); + *thread_index = stream_session_thread_from_handle (handle); +} + +always_inline stream_session_t * +stream_session_get_from_handle (u64 handle) +{ + session_manager_main_t *smm = &session_manager_main; + return pool_elt_at_index (smm->sessions[stream_session_thread_from_handle + (handle)], + stream_session_index_from_handle (handle)); +} + always_inline stream_session_t * stream_session_listener_get (u8 sst, u64 si) { @@ -375,13 +362,14 @@ void stream_session_reset_notify (transport_connection_t * tc); int stream_session_accept (transport_connection_t * tc, u32 listener_index, u8 sst, u8 notify); -int stream_session_open (u8 sst, ip46_address_t * addr, - u16 port_host_byte_order, u32 api_client_index); +int +stream_session_open (u32 app_index, session_type_t st, + 
transport_endpoint_t * tep, + transport_connection_t ** tc); +int stream_session_listen (stream_session_t * s, transport_endpoint_t * tep); +int stream_session_stop_listen (stream_session_t * s); void stream_session_disconnect (stream_session_t * s); void stream_session_cleanup (stream_session_t * s); -int -stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port); -void stream_session_stop_listen (u32 server_index); u8 *format_stream_session (u8 * s, va_list * args); @@ -390,6 +378,71 @@ transport_proto_vft_t *session_get_transport_vft (u8 type); clib_error_t *vnet_session_enable_disable (vlib_main_t * vm, u8 is_en); +always_inline unix_shared_memory_queue_t * +session_manager_get_vpp_event_queue (u32 thread_index) +{ + return session_manager_main.vpp_event_queues[thread_index]; +} + +int session_manager_flush_enqueue_events (u32 thread_index); + +always_inline u64 +listen_session_get_handle (stream_session_t * s) +{ + ASSERT (s->session_state == SESSION_STATE_LISTENING); + return ((u64) s->session_type << 32) | s->session_index; +} + +always_inline stream_session_t * +listen_session_get_from_handle (u64 handle) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *s; + u32 type, index; + type = handle >> 32; + index = handle & 0xFFFFFFFF; + + if (pool_is_free_index (smm->listen_sessions[type], index)) + return 0; + + s = pool_elt_at_index (smm->listen_sessions[type], index); + ASSERT (s->session_state == SESSION_STATE_LISTENING); + return s; +} + +always_inline stream_session_t * +listen_session_new (session_type_t type) +{ + stream_session_t *s; + pool_get (session_manager_main.listen_sessions[type], s); + memset (s, 0, sizeof (*s)); + + s->session_type = type; + s->session_state = SESSION_STATE_LISTENING; + s->session_index = s - session_manager_main.listen_sessions[type]; + + return s; +} + +always_inline stream_session_t * +listen_session_get (session_type_t type, u32 index) +{ + return pool_elt_at_index 
(session_manager_main.listen_sessions[type], + index); +} + +always_inline void +listen_session_del (stream_session_t * s) +{ + pool_put (session_manager_main.listen_sessions[s->session_type], s); +} + +always_inline u8 +session_manager_is_enabled () +{ + return session_manager_main.is_enabled == 1; +} + #endif /* __included_session_h__ */ /* diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 9c38428a..a82dfe0b 100644 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -38,6 +38,8 @@ #define foreach_session_api_msg \ _(MAP_ANOTHER_SEGMENT_REPLY, map_another_segment_reply) \ +_(APPLICATION_ATTACH, application_attach) \ +_(APPLICATION_DETACH, application_detach) \ _(BIND_URI, bind_uri) \ _(UNBIND_URI, unbind_uri) \ _(CONNECT_URI, connect_uri) \ @@ -48,13 +50,8 @@ _(RESET_SESSION_REPLY, reset_session_reply) \ _(BIND_SOCK, bind_sock) \ _(UNBIND_SOCK, unbind_sock) \ _(CONNECT_SOCK, connect_sock) \ -_(DISCONNECT_SOCK, disconnect_sock) \ -_(DISCONNECT_SOCK_REPLY, disconnect_sock_reply) \ -_(ACCEPT_SOCK_REPLY, accept_sock_reply) \ -_(RESET_SOCK_REPLY, reset_sock_reply) \ _(SESSION_ENABLE_DISABLE, session_enable_disable) \ - static int send_add_segment_callback (u32 api_client_index, const u8 * segment_name, u32 segment_size) @@ -80,11 +77,14 @@ send_add_segment_callback (u32 api_client_index, const u8 * segment_name, } static int -send_session_accept_uri_callback (stream_session_t * s) +send_session_accept_callback (stream_session_t * s) { vl_api_accept_session_t *mp; unix_shared_memory_queue_t *q, *vpp_queue; application_t *server = application_get (s->app_index); + transport_connection_t *tc; + transport_proto_vft_t *tp_vft; + stream_session_t *listener; q = vl_api_client_index_to_input_queue (server->api_client_index); vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); @@ -93,24 +93,28 @@ send_session_accept_uri_callback (stream_session_t * s) return -1; mp = vl_msg_api_alloc (sizeof (*mp)); - 
mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SESSION); + memset (mp, 0, sizeof (*mp)); - /* Note: session_type is the first octet in all types of sessions */ + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SESSION); - mp->accept_cookie = server->accept_cookie; + listener = listen_session_get (s->session_type, s->listener_index); + tp_vft = session_get_transport_vft (s->session_type); + tc = tp_vft->get_connection (s->connection_index, s->thread_index); + mp->listener_handle = listen_session_get_handle (listener); + mp->handle = stream_session_handle (s); mp->server_rx_fifo = (u64) s->server_rx_fifo; mp->server_tx_fifo = (u64) s->server_tx_fifo; - mp->session_thread_index = s->thread_index; - mp->session_index = s->session_index; - mp->session_type = s->session_type; mp->vpp_event_queue_address = (u64) vpp_queue; + mp->port = tc->rmt_port; + mp->is_ip4 = tc->is_ip4; + clib_memcpy (&mp->ip, &tc->rmt_ip, sizeof (tc->rmt_ip)); vl_msg_api_send_shmem (q, (u8 *) & mp); return 0; } static void -send_session_disconnect_uri_callback (stream_session_t * s) +send_session_disconnect_callback (stream_session_t * s) { vl_api_disconnect_session_t *mp; unix_shared_memory_queue_t *q; @@ -124,14 +128,12 @@ send_session_disconnect_uri_callback (stream_session_t * s) mp = vl_msg_api_alloc (sizeof (*mp)); memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SESSION); - - mp->session_thread_index = s->thread_index; - mp->session_index = s->session_index; + mp->handle = stream_session_handle (s); vl_msg_api_send_shmem (q, (u8 *) & mp); } static void -send_session_reset_uri_callback (stream_session_t * s) +send_session_reset_callback (stream_session_t * s) { vl_api_reset_session_t *mp; unix_shared_memory_queue_t *q; @@ -145,22 +147,20 @@ send_session_reset_uri_callback (stream_session_t * s) mp = vl_msg_api_alloc (sizeof (*mp)); memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_RESET_SESSION); - - 
mp->session_thread_index = s->thread_index; - mp->session_index = s->session_index; + mp->handle = stream_session_handle (s); vl_msg_api_send_shmem (q, (u8 *) & mp); } static int -send_session_connected_uri_callback (u32 api_client_index, - stream_session_t * s, u8 is_fail) +send_session_connected_callback (u32 app_index, u32 api_context, + stream_session_t * s, u8 is_fail) { vl_api_connect_uri_reply_t *mp; unix_shared_memory_queue_t *q; - application_t *app = application_lookup (api_client_index); - u8 *seg_name; + application_t *app; unix_shared_memory_queue_t *vpp_queue; + app = application_get (app_index); q = vl_api_client_index_to_input_queue (app->api_client_index); if (!q) @@ -168,24 +168,15 @@ send_session_connected_uri_callback (u32 api_client_index, mp = vl_msg_api_alloc (sizeof (*mp)); mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); - mp->context = app->api_context; + mp->context = api_context; if (!is_fail) { vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); mp->server_rx_fifo = (u64) s->server_rx_fifo; mp->server_tx_fifo = (u64) s->server_tx_fifo; - mp->session_thread_index = s->thread_index; - mp->session_index = s->session_index; - mp->session_type = s->session_type; + mp->handle = stream_session_handle (s); mp->vpp_event_queue_address = (u64) vpp_queue; - mp->client_event_queue_address = (u64) app->event_queue; mp->retval = 0; - - session_manager_get_segment_info (s->server_segment_index, &seg_name, - &mp->segment_size); - mp->segment_name_length = vec_len (seg_name); - if (mp->segment_name_length) - clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); } else { @@ -195,199 +186,14 @@ send_session_connected_uri_callback (u32 api_client_index, vl_msg_api_send_shmem (q, (u8 *) & mp); /* Remove client if connect failed */ - if (is_fail) - { - application_del (app); - } - else - { - s->session_state = SESSION_STATE_READY; - } - - return 0; -} - -/** - * Redirect a connect_uri message to the indicated 
server. - * Only sent if the server has bound the related port with - * URI_OPTIONS_FLAGS_USE_FIFO - */ -static int -redirect_connect_uri_callback (u32 server_api_client_index, void *mp_arg) -{ - vl_api_connect_uri_t *mp = mp_arg; - unix_shared_memory_queue_t *server_q, *client_q; - vlib_main_t *vm = vlib_get_main (); - f64 timeout = vlib_time_now (vm) + 0.5; - int rv = 0; - - server_q = vl_api_client_index_to_input_queue (server_api_client_index); - - if (!server_q) - { - rv = VNET_API_ERROR_INVALID_VALUE; - goto out; - } - - client_q = vl_api_client_index_to_input_queue (mp->client_index); - if (!client_q) - { - rv = VNET_API_ERROR_INVALID_VALUE_2; - goto out; - } - - /* Tell the server the client's API queue address, so it can reply */ - mp->client_queue_address = (u64) client_q; - - /* - * Bounce message handlers MUST NOT block the data-plane. - * Spin waiting for the queue lock, but - */ - - while (vlib_time_now (vm) < timeout) - { - rv = - unix_shared_memory_queue_add (server_q, (u8 *) & mp, 1 /*nowait */ ); - switch (rv) - { - /* correctly enqueued */ - case 0: - return VNET_CONNECT_REDIRECTED; - - /* continue spinning, wait for pthread_mutex_trylock to work */ - case -1: - continue; - - /* queue stuffed, drop the msg */ - case -2: - rv = VNET_API_ERROR_QUEUE_FULL; - goto out; - } - } -out: - /* Dispose of the message */ - vl_msg_api_free (mp); - return rv; -} - -static u64 -make_session_handle (stream_session_t * s) -{ - return (u64) s->session_index << 32 | (u64) s->thread_index; -} - -static int -send_session_accept_callback (stream_session_t * s) -{ - vl_api_accept_sock_t *mp; - unix_shared_memory_queue_t *q, *vpp_queue; - application_t *server = application_get (s->app_index); - - q = vl_api_client_index_to_input_queue (server->api_client_index); - vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); - - if (!q) - return -1; - - mp = vl_msg_api_alloc (sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SOCK); - - /* Note: 
session_type is the first octet in all types of sessions */ - - mp->accept_cookie = server->accept_cookie; - mp->server_rx_fifo = (u64) s->server_rx_fifo; - mp->server_tx_fifo = (u64) s->server_tx_fifo; - mp->handle = make_session_handle (s); - mp->vpp_event_queue_address = (u64) vpp_queue; - vl_msg_api_send_shmem (q, (u8 *) & mp); - - return 0; -} - -static int -send_session_connected_callback (u32 api_client_index, stream_session_t * s, - u8 is_fail) -{ - vl_api_connect_sock_reply_t *mp; - unix_shared_memory_queue_t *q; - application_t *app = application_lookup (api_client_index); - u8 *seg_name; - unix_shared_memory_queue_t *vpp_queue; - - q = vl_api_client_index_to_input_queue (app->api_client_index); - - if (!q) - return -1; - - mp = vl_msg_api_alloc (sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_SOCK_REPLY); - mp->context = app->api_context; - mp->retval = is_fail; if (!is_fail) { - vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); - mp->server_rx_fifo = (u64) s->server_rx_fifo; - mp->server_tx_fifo = (u64) s->server_tx_fifo; - mp->handle = make_session_handle (s); - mp->vpp_event_queue_address = (u64) vpp_queue; - mp->client_event_queue_address = (u64) app->event_queue; - - session_manager_get_segment_info (s->server_segment_index, &seg_name, - &mp->segment_size); - mp->segment_name_length = vec_len (seg_name); - if (mp->segment_name_length) - clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); + s->session_state = SESSION_STATE_READY; } - vl_msg_api_send_shmem (q, (u8 *) & mp); - - /* Remove client if connect failed */ - if (is_fail) - application_del (app); - return 0; } -static void -send_session_disconnect_callback (stream_session_t * s) -{ - vl_api_disconnect_sock_t *mp; - unix_shared_memory_queue_t *q; - application_t *app = application_get (s->app_index); - - q = vl_api_client_index_to_input_queue (app->api_client_index); - - if (!q) - return; - - mp = vl_msg_api_alloc (sizeof (*mp)); - memset 
(mp, 0, sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SOCK); - - mp->handle = make_session_handle (s); - vl_msg_api_send_shmem (q, (u8 *) & mp); -} - -static void -send_session_reset_callback (stream_session_t * s) -{ - vl_api_reset_sock_t *mp; - unix_shared_memory_queue_t *q; - application_t *app = application_get (s->app_index); - - q = vl_api_client_index_to_input_queue (app->api_client_index); - - if (!q) - return; - - mp = vl_msg_api_alloc (sizeof (*mp)); - memset (mp, 0, sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_RESET_SOCK); - - mp->handle = make_session_handle (s); - vl_msg_api_send_shmem (q, (u8 *) & mp); -} - /** * Redirect a connect_uri message to the indicated server. * Only sent if the server has bound the related port with @@ -396,10 +202,11 @@ send_session_reset_callback (stream_session_t * s) static int redirect_connect_callback (u32 server_api_client_index, void *mp_arg) { - vl_api_connect_sock_t *mp = mp_arg; + vl_api_connect_uri_t *mp = mp_arg; unix_shared_memory_queue_t *server_q, *client_q; vlib_main_t *vm = vlib_get_main (); f64 timeout = vlib_time_now (vm) + 0.5; + application_t *app; int rv = 0; server_q = vl_api_client_index_to_input_queue (server_api_client_index); @@ -419,6 +226,9 @@ redirect_connect_callback (u32 server_api_client_index, void *mp_arg) /* Tell the server the client's API queue address, so it can reply */ mp->client_queue_address = (u64) client_q; + app = application_lookup (mp->client_index); + mp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = app->sm_properties.rx_fifo_size; + mp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = app->sm_properties.tx_fifo_size; /* * Bounce message handlers MUST NOT block the data-plane. 
@@ -452,15 +262,6 @@ out: } static session_cb_vft_t uri_session_cb_vft = { - .session_accept_callback = send_session_accept_uri_callback, - .session_disconnect_callback = send_session_disconnect_uri_callback, - .session_connected_callback = send_session_connected_uri_callback, - .session_reset_callback = send_session_reset_uri_callback, - .add_segment_callback = send_add_segment_callback, - .redirect_connect_callback = redirect_connect_uri_callback -}; - -static session_cb_vft_t session_cb_vft = { .session_accept_callback = send_session_accept_callback, .session_disconnect_callback = send_session_disconnect_callback, .session_connected_callback = send_session_connected_callback, @@ -498,60 +299,134 @@ vl_api_session_enable_disable_t_handler (vl_api_session_enable_disable_t * mp) } static void -vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) +vl_api_application_attach_t_handler (vl_api_application_attach_t * mp) { - vl_api_bind_uri_reply_t *rmp; - vnet_bind_args_t _a, *a = &_a; - char segment_name[128]; - u32 segment_name_length; + vl_api_application_attach_reply_t *rmp; + vnet_app_attach_args_t _a, *a = &_a; int rv; - _Static_assert (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= - sizeof (mp->options), - "Out of options, fix api message definition"); + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } - segment_name_length = ARRAY_LEN (segment_name); + STATIC_ASSERT (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= + sizeof (mp->options), + "Out of options, fix api message definition"); memset (a, 0, sizeof (*a)); - a->uri = (char *) mp->uri; a->api_client_index = mp->client_index; a->options = mp->options; - a->segment_name = segment_name; - a->segment_name_length = segment_name_length; a->session_cb_vft = &uri_session_cb_vft; - a->options[SESSION_OPTIONS_SEGMENT_SIZE] = mp->initial_segment_size; - a->options[SESSION_OPTIONS_ACCEPT_COOKIE] = mp->accept_cookie; - rv = vnet_bind_uri (a); + rv = vnet_application_attach 
(a); +done: /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_BIND_URI_REPLY, ({ + REPLY_MACRO2 (VL_API_APPLICATION_ATTACH_REPLY, ({ rmp->retval = rv; if (!rv) { rmp->segment_name_length = 0; /* $$$$ policy? */ - rmp->segment_size = mp->initial_segment_size; - if (segment_name_length) + rmp->segment_size = a->segment_size; + if (a->segment_name_length) { - memcpy (rmp->segment_name, segment_name, segment_name_length); - rmp->segment_name_length = segment_name_length; + memcpy (rmp->segment_name, a->segment_name, + a->segment_name_length); + rmp->segment_name_length = a->segment_name_length; } - rmp->server_event_queue_address = a->server_event_queue_address; + rmp->app_event_queue_address = a->app_event_queue_address; } })); /* *INDENT-ON* */ } +static void +vl_api_application_detach_t_handler (vl_api_application_detach_t * mp) +{ + vl_api_application_detach_reply_t *rmp; + int rv = VNET_API_ERROR_INVALID_VALUE_2; + vnet_app_detach_args_t _a, *a = &_a; + application_t *app; + + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } + + app = application_lookup (mp->client_index); + if (app) + { + a->app_index = app->index; + rv = vnet_application_detach (a); + } + +done: + REPLY_MACRO (VL_API_APPLICATION_DETACH_REPLY); +} + +static void +vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) +{ + vl_api_bind_uri_reply_t *rmp; + vnet_bind_args_t _a, *a = &_a; + application_t *app; + int rv; + + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } + + app = application_lookup (mp->client_index); + if (app) + { + memset (a, 0, sizeof (*a)); + a->uri = (char *) mp->uri; + a->app_index = app->index; + rv = vnet_bind_uri (a); + } + else + { + rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } + +done: + REPLY_MACRO (VL_API_BIND_URI_REPLY); +} + static void vl_api_unbind_uri_t_handler (vl_api_unbind_uri_t * mp) { vl_api_unbind_uri_reply_t *rmp; + application_t *app; + vnet_unbind_args_t _a, 
*a = &_a; int rv; - rv = vnet_unbind_uri ((char *) mp->uri, mp->client_index); + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } + + app = application_lookup (mp->client_index); + if (app) + { + a->uri = (char *) mp->uri; + a->app_index = app->index; + rv = vnet_unbind_uri (a); + } + else + { + rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } +done: REPLY_MACRO (VL_API_UNBIND_URI_REPLY); } @@ -560,26 +435,37 @@ vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) { vl_api_connect_uri_reply_t *rmp; vnet_connect_args_t _a, *a = &_a; + application_t *app; int rv; - a->uri = (char *) mp->uri; - a->api_client_index = mp->client_index; - a->api_context = mp->context; - a->options = mp->options; - a->session_cb_vft = &uri_session_cb_vft; - a->mp = mp; + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } - rv = vnet_connect_uri (a); + app = application_lookup (mp->client_index); + if (app) + { + a->uri = (char *) mp->uri; + a->api_context = mp->context; + a->app_index = app->index; + a->mp = mp; + rv = vnet_connect_uri (a); + } + else + { + rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } if (rv == 0 || rv == VNET_CONNECT_REDIRECTED) return; /* Got some error, relay it */ +done: /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_CONNECT_URI_REPLY, ({ - rmp->retval = rv; - })); + REPLY_MACRO (VL_API_CONNECT_URI_REPLY); /* *INDENT-ON* */ } @@ -587,13 +473,29 @@ static void vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) { vl_api_disconnect_session_reply_t *rmp; - int rv; + vnet_disconnect_args_t _a, *a = &_a; + application_t *app; + int rv = 0; - rv = api_session_not_valid (mp->session_index, mp->session_thread_index); - if (!rv) - rv = - vnet_disconnect_session (mp->session_index, mp->session_thread_index); + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } + + app = application_lookup 
(mp->client_index); + if (app) + { + a->handle = mp->handle; + a->app_index = app->index; + rv = vnet_disconnect_session (a); + } + else + { + rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + } +done: REPLY_MACRO (VL_API_DISCONNECT_SESSION_REPLY); } @@ -601,11 +503,8 @@ static void vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * mp) { - if (api_session_not_valid (mp->session_index, mp->session_thread_index)) - { - clib_warning ("Invalid session!"); - return; - } + vnet_disconnect_args_t _a, *a = &_a; + application_t *app; /* Client objected to disconnecting the session, log and continue */ if (mp->retval) @@ -615,15 +514,29 @@ vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * } /* Disconnect has been confirmed. Confirm close to transport */ - vnet_disconnect_session (mp->session_index, mp->session_thread_index); + app = application_lookup (mp->client_index); + if (app) + { + a->handle = mp->handle; + a->app_index = app->index; + vnet_disconnect_session (a); + } } static void vl_api_reset_session_reply_t_handler (vl_api_reset_session_reply_t * mp) { + application_t *app; stream_session_t *s; + u32 index, thread_index; + + app = application_lookup (mp->client_index); + if (!app) + return; - if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + stream_session_parse_handle (mp->handle, &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + if (s == 0 || app->index != s->app_index) { clib_warning ("Invalid session!"); return; @@ -636,8 +549,6 @@ vl_api_reset_session_reply_t_handler (vl_api_reset_session_reply_t * mp) return; } - s = stream_session_get (mp->session_index, mp->session_thread_index); - /* This comes as a response to a reset, transport only waiting for * confirmation to remove connection state, no need to disconnect */ stream_session_cleanup (s); @@ -648,11 +559,13 @@ vl_api_accept_session_reply_t_handler (vl_api_accept_session_reply_t * mp) { 
stream_session_t *s; int rv; - - if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + u32 session_index, thread_index; + session_index = stream_session_index_from_handle (mp->handle); + thread_index = stream_session_thread_from_handle (mp->handle); + if (api_session_not_valid (session_index, thread_index)) return; - s = stream_session_get (mp->session_index, mp->session_thread_index); + s = stream_session_get (session_index, thread_index); rv = mp->retval; if (rv) @@ -677,49 +590,31 @@ vl_api_bind_sock_t_handler (vl_api_bind_sock_t * mp) { vl_api_bind_sock_reply_t *rmp; vnet_bind_args_t _a, *a = &_a; - char segment_name[128]; - u32 segment_name_length; - int rv; - - STATIC_ASSERT (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= - sizeof (mp->options), - "Out of options, fix api message definition"); - - segment_name_length = ARRAY_LEN (segment_name); - - memset (a, 0, sizeof (*a)); - - clib_memcpy (&a->tep.ip, mp->ip, - (mp->is_ip4 ? sizeof (ip4_address_t) : - sizeof (ip6_address_t))); - a->tep.is_ip4 = mp->is_ip4; - a->tep.port = mp->port; - a->tep.vrf = mp->vrf; - - a->api_client_index = mp->client_index; - a->options = mp->options; - a->segment_name = segment_name; - a->segment_name_length = segment_name_length; - a->session_cb_vft = &session_cb_vft; + int rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; + application_t *app; - rv = vnet_bind_uri (a); + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } - /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_BIND_SOCK_REPLY, ({ - rmp->retval = rv; - if (!rv) - { - rmp->segment_name_length = 0; - rmp->segment_size = mp->options[SESSION_OPTIONS_SEGMENT_SIZE]; - if (segment_name_length) - { - memcpy(rmp->segment_name, segment_name, segment_name_length); - rmp->segment_name_length = segment_name_length; - } - rmp->server_event_queue_address = a->server_event_queue_address; - } - })); - /* *INDENT-ON* */ + app = application_lookup (mp->client_index); + if (app) + { 
+ memset (a, 0, sizeof (*a)); + clib_memcpy (&a->tep.ip, mp->ip, (mp->is_ip4 ? + sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->tep.is_ip4 = mp->is_ip4; + a->tep.port = mp->port; + a->tep.vrf = mp->vrf; + a->app_index = app->index; + + rv = vnet_bind (a); + } +done: + REPLY_MACRO (VL_API_BIND_SOCK_REPLY); } static void @@ -727,13 +622,24 @@ vl_api_unbind_sock_t_handler (vl_api_unbind_sock_t * mp) { vl_api_unbind_sock_reply_t *rmp; vnet_unbind_args_t _a, *a = &_a; - int rv; + application_t *app; + int rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; - a->api_client_index = mp->client_index; - a->handle = mp->handle; + if (session_manager_is_enabled () == 0) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; + } - rv = vnet_unbind (a); + app = application_lookup (mp->client_index); + if (app) + { + a->app_index = mp->client_index; + a->handle = mp->handle; + rv = vnet_unbind (a); + } +done: REPLY_MACRO (VL_API_UNBIND_SOCK_REPLY); } @@ -742,114 +648,55 @@ vl_api_connect_sock_t_handler (vl_api_connect_sock_t * mp) { vl_api_connect_sock_reply_t *rmp; vnet_connect_args_t _a, *a = &_a; + application_t *app; int rv; - clib_memcpy (&a->tep.ip, mp->ip, - (mp->is_ip4 ? 
sizeof (ip4_address_t) : - sizeof (ip6_address_t))); - a->tep.is_ip4 = mp->is_ip4; - a->tep.port = mp->port; - a->tep.vrf = mp->vrf; - a->options = mp->options; - a->session_cb_vft = &session_cb_vft; - a->api_context = mp->context; - a->mp = mp; - - rv = vnet_connect (a); - - if (rv == 0 || rv == VNET_CONNECT_REDIRECTED) - return; - - /* Got some error, relay it */ - - /* *INDENT-OFF* */ - REPLY_MACRO2 (VL_API_CONNECT_URI_REPLY, ({ - rmp->retval = rv; - })); - /* *INDENT-ON* */ -} - -static void -vl_api_disconnect_sock_t_handler (vl_api_disconnect_sock_t * mp) -{ - vnet_disconnect_args_t _a, *a = &_a; - vl_api_disconnect_sock_reply_t *rmp; - int rv; - - a->api_client_index = mp->client_index; - a->handle = mp->handle; - rv = vnet_disconnect (a); - - REPLY_MACRO (VL_API_DISCONNECT_SOCK_REPLY); -} - -static void -vl_api_disconnect_sock_reply_t_handler (vl_api_disconnect_sock_reply_t * mp) -{ - vnet_disconnect_args_t _a, *a = &_a; - - /* Client objected to disconnecting the session, log and continue */ - if (mp->retval) + if (session_manager_is_enabled () == 0) { - clib_warning ("client retval %d", mp->retval); - return; + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto done; } - a->api_client_index = mp->client_index; - a->handle = mp->handle; - - vnet_disconnect (a); -} - -static void -vl_api_reset_sock_reply_t_handler (vl_api_reset_sock_reply_t * mp) -{ - stream_session_t *s; - u32 session_index, thread_index; - - /* Client objected to resetting the session, log and continue */ - if (mp->retval) + app = application_lookup (mp->client_index); + if (app) { - clib_warning ("client retval %d", mp->retval); - return; + clib_memcpy (&a->tep.ip, mp->ip, + (mp->is_ip4 ? 
sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->api_context = mp->context; + a->app_index = app->index; + a->mp = mp; + rv = vnet_connect (a); } - - if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) + else { - clib_warning ("Invalid handle"); - return; + rv = VNET_API_ERROR_APPLICATION_NOT_ATTACHED; } - s = stream_session_get (session_index, thread_index); + if (rv == 0 || rv == VNET_CONNECT_REDIRECTED) + return; - /* This comes as a response to a reset, transport only waiting for - * confirmation to remove connection state, no need to disconnect */ - stream_session_cleanup (s); + /* Got some error, relay it */ + +done: + REPLY_MACRO (VL_API_CONNECT_URI_REPLY); } -static void -vl_api_accept_sock_reply_t_handler (vl_api_accept_sock_reply_t * mp) +static clib_error_t * +application_reaper_cb (u32 client_index) { - stream_session_t *s; - u32 session_index, thread_index; - - if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) - { - clib_warning ("Invalid handle"); - return; - } - s = stream_session_get (session_index, thread_index); - - if (mp->retval) + application_t *app = application_lookup (client_index); + vnet_app_detach_args_t _a, *a = &_a; + if (app) { - /* Server isn't interested, kill the session */ - stream_session_disconnect (s); - return; + a->app_index = app->index; + vnet_application_detach (a); } - - s->session_state = SESSION_STATE_READY; + return 0; } +VL_MSG_API_REAPER_FUNCTION (application_reaper_cb); + #define vl_msg_name_crc_list #include #undef vl_msg_name_crc_list @@ -903,6 +750,7 @@ session_api_hookup (vlib_main_t * vm) } VLIB_API_INIT_FUNCTION (session_api_hookup); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 2f912cbc..7ea7af15 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -30,7 +30,7 @@ typedef struct _transport_connection ip46_address_t lcl_ip; /**< Local IP */ u16 
lcl_port; /**< Local port */ u16 rmt_port; /**< Remote port */ - u8 proto; /**< Transport protocol id */ + u8 proto; /**< Transport protocol id (also session type) */ u32 s_index; /**< Parent session index */ u32 c_index; /**< Connection index in transport pool */ @@ -103,7 +103,8 @@ typedef CLIB_PACKED (struct { { struct { - ip4_address_t src; ip4_address_t dst; + ip4_address_t src; + ip4_address_t dst; u16 src_port; u16 dst_port; /* align by making this 4 octets even though its a 1-bit field @@ -122,10 +123,14 @@ typedef CLIB_PACKED (struct { struct { /* 48 octets */ - ip6_address_t src; ip6_address_t dst; + ip6_address_t src; + ip6_address_t dst; u16 src_port; - u16 dst_port; u32 proto; u8 unused_for_now[8]; - }; u64 as_u64[6]; + u16 dst_port; + u32 proto; + u8 unused_for_now[8]; + }; + u64 as_u64[6]; }; }) v6_connection_key_t; /* *INDENT-ON* */ @@ -233,10 +238,10 @@ make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) typedef struct _transport_endpoint { - ip46_address_t ip; - u16 port; - u8 is_ip4; - u32 vrf; + ip46_address_t ip; /** ip address */ + u16 port; /** port in host order */ + u8 is_ip4; /** 1 if ip4 */ + u32 vrf; /** fib table the endpoint is associated with */ } transport_endpoint_t; typedef clib_bihash_24_8_t transport_endpoint_table_t; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 9e8e1561..f8fbf28c 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -237,8 +237,7 @@ tclient_thread_fn (void *arg) memset (dmp, 0, sizeof (*dmp)); dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; - dmp->session_index = sp->vpp_session_index; - dmp->session_thread_index = sp->vpp_session_thread; + dmp->handle = sp->vpp_session_handle; vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); pool_put (tm->sessions, sp); } @@ -253,9 +252,10 @@ tclient_thread_fn (void *arg) static void vl_api_memclnt_create_reply_t_handler 
(vl_api_memclnt_create_reply_t * mp) { + vlib_main_t *vm = vlib_get_main (); tclient_main_t *tm = &tclient_main; - tm->my_client_index = mp->index; + vlib_process_signal_event (vm, tm->node_index, 1 /* evt */ , 0 /* data */ ); } static void @@ -264,7 +264,6 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) tclient_main_t *tm = &tclient_main; session_t *session; u32 session_index; - u64 key; i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ; if (retval < 0) @@ -291,24 +290,24 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) session->server_rx_fifo->client_session_index = session_index; session->server_tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; session->server_tx_fifo->client_session_index = session_index; - - session->vpp_session_index = mp->session_index; - session->vpp_session_thread = mp->session_thread_index; + session->vpp_session_handle = mp->handle; /* Add it to the session lookup table */ - key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; - hash_set (tm->session_index_by_vpp_handles, key, session_index); + hash_set (tm->session_index_by_vpp_handles, mp->handle, session_index); tm->ready_connections++; } -static void +static int create_api_loopback (tclient_main_t * tm) { + vlib_main_t *vm = vlib_get_main (); vl_api_memclnt_create_t _m, *mp = &_m; extern void vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t *); api_main_t *am = &api_main; vl_shmem_hdr_t *shmem_hdr; + uword *event_data = 0, event_type; + int resolved = 0; /* * Create a "loopback" API client connection @@ -324,6 +323,25 @@ create_api_loopback (tclient_main_t * tm) strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); vl_api_memclnt_create_t_handler (mp); + + /* Wait for reply */ + tm->node_index = vlib_get_current_process (vm)->node_runtime.node_index; + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case 1: 
+ resolved = 1; + break; + case ~0: + /* timed out */ + break; + default: + clib_warning ("unknown event_type %d", event_type); + } + if (!resolved) + return -1; + return 0; } #define foreach_tclient_static_api_msg \ @@ -333,17 +351,7 @@ _(CONNECT_URI_REPLY, connect_uri_reply) static clib_error_t * tclient_api_hookup (vlib_main_t * vm) { - tclient_main_t *tm = &tclient_main; vl_msg_api_msg_config_t _c, *c = &_c; - int i; - - /* Init test data */ - vec_validate (tm->connect_test_data, 64 * 1024 - 1); - for (i = 0; i < vec_len (tm->connect_test_data); i++) - tm->connect_test_data[i] = i & 0xff; - - tm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); - vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1); /* Hook up client-side static APIs to our handlers */ #define _(N,n) do { \ @@ -365,18 +373,105 @@ tclient_api_hookup (vlib_main_t * vm) return 0; } -VLIB_API_INIT_FUNCTION (tclient_api_hookup); +static int +tcp_test_clients_init (vlib_main_t * vm) +{ + tclient_main_t *tm = &tclient_main; + int i; + + tclient_api_hookup (vm); + if (create_api_loopback (tm)) + return -1; + + /* Init test data */ + vec_validate (tm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (tm->connect_test_data); i++) + tm->connect_test_data[i] = i & 0xff; + + tm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1); + + tm->is_init = 1; + + return 0; +} + +static void +builtin_session_reset_callback (stream_session_t * s) +{ + return; +} + +static int +builtin_session_connected_callback (u32 app_index, u32 api_context, + stream_session_t * s, u8 code) +{ + return 0; +} + +static int +builtin_session_create_callback (stream_session_t * s) +{ + return 0; +} + +static void +builtin_session_disconnect_callback (stream_session_t * s) +{ + return; +} + +static int +builtin_server_rx_callback (stream_session_t * s) +{ + return 0; +} + +/* *INDENT-OFF* */ +static session_cb_vft_t 
builtin_clients = { + .session_reset_callback = builtin_session_reset_callback, + .session_connected_callback = builtin_session_connected_callback, + .session_accept_callback = builtin_session_create_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; +/* *INDENT-ON* */ + +static int +attach_builtin_test_clients () +{ + vnet_app_attach_args_t _a, *a = &_a; + u8 segment_name[128]; + u32 segment_name_length; + u64 options[16]; + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + memset (options, 0, sizeof (options)); + + a->api_client_index = ~0; + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &builtin_clients; + + options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; + options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + a->options = options; + + return vnet_application_attach (a); +} static clib_error_t * test_tcp_clients_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { + tclient_main_t *tm = &tclient_main; u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234"; u8 *uri; - tclient_main_t *tm = &tclient_main; - int i; u32 n_clients = 1; + int i; tm->bytes_to_send = 8192; tm->n_iterations = 1; @@ -397,14 +492,19 @@ test_tcp_clients_command_fn (vlib_main_t * vm, format_unformat_error, input); } + if (tm->is_init == 0) + { + if (tcp_test_clients_init (vm)) + return clib_error_return (0, "failed init"); + } + tm->ready_connections = 0; tm->expected_connections = n_clients; + uri = connect_uri; if (tm->connect_uri) uri = tm->connect_uri; - create_api_loopback (tm); - #if TCP_BUILTIN_CLIENT_PTHREAD /* Start a transmit thread */ if (tm->client_thread_handle == 0) @@ -420,6 +520,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, } #endif vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. 
*/ ); + attach_builtin_test_clients (); /* Fire off connect requests, in something approaching a normal manner */ for (i = 0; i < n_clients; i++) @@ -461,6 +562,16 @@ VLIB_CLI_COMMAND (test_clients_command, static) = }; /* *INDENT-ON* */ +clib_error_t * +tcp_test_clients_main_init (vlib_main_t * vm) +{ + tclient_main_t *tm = &tclient_main; + tm->is_init = 0; + return 0; +} + +VLIB_INIT_FUNCTION (tcp_test_clients_main_init); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 64030302..2bd87c07 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -39,8 +39,7 @@ typedef struct svm_fifo_t *server_rx_fifo; svm_fifo_t *server_tx_fifo; - u32 vpp_session_index; - u32 vpp_session_thread; + u64 vpp_session_handle; } session_t; typedef struct @@ -110,6 +109,10 @@ typedef struct u32 client_bytes_received; u8 test_return_packets; + u8 is_init; + + u32 node_index; + /* convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 917d4bd3..8308e3d9 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -18,17 +18,46 @@ #include #include +/* define message IDs */ +#include + +/* define message structures */ +#define vl_typedefs +#include +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include +#undef vl_printfun + typedef struct { u8 *rx_buf; unix_shared_memory_queue_t **vpp_queue; - u32 byte_index; + u64 byte_index; + + /* Sever's event queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + u32 app_index; + + /* process node index for evnt scheduling */ + u32 node_index; vlib_main_t *vlib_main; } builtin_server_main_t; builtin_server_main_t builtin_server_main; - int builtin_session_accept_callback (stream_session_t * s) { @@ -45,9 +74,13 @@ builtin_session_accept_callback (stream_session_t * s) void builtin_session_disconnect_callback (stream_session_t * s) { + builtin_server_main_t *bsm = &builtin_server_main; + vnet_disconnect_args_t _a, *a = &_a; clib_warning ("called..."); - vnet_disconnect_session (s->session_index, s->thread_index); + a->handle = stream_session_handle (s); + a->app_index = bsm->app_index; + vnet_disconnect_session (a); } void @@ -60,7 +93,7 @@ builtin_session_reset_callback (stream_session_t * s) int -builtin_session_connected_callback (u32 client_index, +builtin_session_connected_callback (u32 app_index, u32 api_context, stream_session_t * s, u8 is_fail) { clib_warning ("called..."); @@ -91,7 +124,7 @@ test_bytes (builtin_server_main_t * bsm, int actual_transfer) { if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) { - clib_warning ("at %d expected %d got %d", bsm->byte_index + i, + clib_warning ("at %lld expected %d got %d", bsm->byte_index + i, (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); } } @@ -190,23 +223,66 @@ static session_cb_vft_t builtin_session_cb_vft = { .session_reset_callback = builtin_session_reset_callback }; +/* Abuse VPP's input queue */ static int -server_create (vlib_main_t * vm) +create_api_loopback (vlib_main_t * vm) { - vnet_bind_args_t _a, *a = &_a; - u64 options[SESSION_OPTIONS_N_OPTIONS]; - char segment_name[128]; - u32 num_threads; - vlib_thread_main_t *vtm = 
vlib_get_thread_main (); + builtin_server_main_t *bsm = &builtin_server_main; + vl_api_memclnt_create_t _m, *mp = &_m; + extern void vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t *); + api_main_t *am = &api_main; + vl_shmem_hdr_t *shmem_hdr; + uword *event_data = 0, event_type; + int resolved = 0; - num_threads = 1 /* main thread */ + vtm->n_threads; - vec_validate (builtin_server_main.vpp_queue, num_threads - 1); + /* + * Create a "loopback" API client connection + * Don't do things like this unless you know what you're doing... + */ + + shmem_hdr = am->shmem_hdr; + bsm->vl_input_queue = shmem_hdr->vl_input_queue; + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; + mp->context = 0xFEEDFACE; + mp->input_queue = (u64) bsm->vl_input_queue; + strncpy ((char *) mp->name, "tcp_test_server", sizeof (mp->name) - 1); + + vl_api_memclnt_create_t_handler (mp); + + /* Wait for reply */ + bsm->node_index = vlib_get_current_process (vm)->node_runtime.node_index; + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case 1: + resolved = 1; + break; + case ~0: + /* timed out */ + break; + default: + clib_warning ("unknown event_type %d", event_type); + } + if (!resolved) + return -1; + + return 0; +} + +static int +server_attach () +{ + builtin_server_main_t *bsm = &builtin_server_main; + u8 segment_name[128]; + u64 options[SESSION_OPTIONS_N_OPTIONS]; + vnet_app_attach_args_t _a, *a = &_a; memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); - a->uri = "tcp://0.0.0.0/1234"; - a->api_client_index = ~0; + a->api_client_index = bsm->my_client_index; a->session_cb_vft = &builtin_session_cb_vft; a->options = options; a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; @@ -215,9 +291,94 @@ server_create (vlib_main_t * vm) a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); + if (vnet_application_attach (a)) + { + 
clib_warning ("failed to attach server"); + return -1; + } + bsm->app_index = a->app_index; + return 0; +} + +static int +server_listen () +{ + builtin_server_main_t *bsm = &builtin_server_main; + vnet_bind_args_t _a, *a = &_a; + memset (a, 0, sizeof (*a)); + a->app_index = bsm->app_index; + a->uri = "tcp://0.0.0.0/1234"; return vnet_bind_uri (a); } +static int +server_create (vlib_main_t * vm) +{ + builtin_server_main_t *bsm = &builtin_server_main; + u32 num_threads; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + + if (bsm->my_client_index == (u32) ~ 0) + { + if (create_api_loopback (vm)) + return -1; + } + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (builtin_server_main.vpp_queue, num_threads - 1); + + if (server_attach ()) + { + clib_warning ("failed to attach server"); + return -1; + } + if (server_listen ()) + { + clib_warning ("failed to start listening"); + return -1; + } + return 0; +} + +/* Get our api client index */ +static void +vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + builtin_server_main_t *bsm = &builtin_server_main; + bsm->my_client_index = mp->index; + vlib_process_signal_event (vm, bsm->node_index, 1 /* evt */ , + 0 /* data */ ); +} + +#define foreach_tcp_builtin_server_api_msg \ +_(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \ + +static clib_error_t * +tcp_builtin_server_api_hookup (vlib_main_t * vm) +{ + vl_msg_api_msg_config_t _c, *c = &_c; + + /* Hook up client-side static APIs to our handlers */ +#define _(N,n) do { \ + c->id = VL_API_##N; \ + c->name = #n; \ + c->handler = vl_api_##n##_t_handler; \ + c->cleanup = vl_noop_handler; \ + c->endian = vl_api_##n##_t_endian; \ + c->print = vl_api_##n##_t_print; \ + c->size = sizeof(vl_api_##n##_t); \ + c->traced = 1; /* trace, so these msgs print */ \ + c->replay = 0; /* don't replay client create/delete msgs */ \ + c->message_bounce = 0; /* don't bounce this message */ \ + 
vl_msg_api_config(c);} while (0); + + foreach_tcp_builtin_server_api_msg; +#undef _ + + return 0; +} + static clib_error_t * server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) @@ -234,6 +395,7 @@ server_create_command_fn (vlib_main_t * vm, } #endif + tcp_builtin_server_api_hookup (vm); vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); rv = server_create (vm); switch (rv) @@ -249,12 +411,22 @@ server_create_command_fn (vlib_main_t * vm, /* *INDENT-OFF* */ VLIB_CLI_COMMAND (server_create_command, static) = { - .path = "test server", - .short_help = "test server", + .path = "test tcp server", + .short_help = "test tcp server", .function = server_create_command_fn, }; /* *INDENT-ON* */ +clib_error_t * +builtin_tcp_server_main_init (vlib_main_t * vm) +{ + builtin_server_main_t *bsm = &builtin_server_main; + bsm->my_client_index = ~0; + return 0; +} + +VLIB_INIT_FUNCTION (builtin_tcp_server_main_init); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index b6c34828..a0c66b9f 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -34,14 +34,19 @@ tcp_connection_bind (u32 session_index, ip46_address_t * ip, listener->c_lcl_port = clib_host_to_net_u16 (port_host_byte_order); if (is_ip4) - listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + { + listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + listener->c_is_ip4 = 1; + listener->c_proto = SESSION_TYPE_IP4_TCP; + } else - clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + { + clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + listener->c_proto = SESSION_TYPE_IP6_TCP; + } listener->c_s_index = session_index; - listener->c_proto = SESSION_TYPE_IP4_TCP; listener->state = TCP_STATE_LISTEN; - listener->c_is_ip4 = 1; tcp_connection_timers_init (listener); @@ -62,7 +67,6 @@ tcp_session_bind_ip6 (u32 session_index, ip46_address_t * ip, u16 port_host_byte_order) { return 
tcp_connection_bind (session_index, ip, port_host_byte_order, 0); - } static void @@ -397,6 +401,7 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); tc->c_c_index = tc - tm->half_open_connections; tc->c_is_ip4 = is_ip4; + tc->c_proto = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; /* The other connection vars will be initialized after SYN ACK */ tcp_connection_timers_init (tc); @@ -518,7 +523,10 @@ format_tcp_session (u8 * s, va_list * args) tcp_connection_t *tc; tc = tcp_connection_get (tci, thread_index); - return format (s, "%U", format_tcp_connection, tc); + if (tc) + return format (s, "%U", format_tcp_connection, tc); + else + return format (s, "empty"); } u8 * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 2f5da108..93f3245d 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -100,8 +100,6 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ #define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ -void tcp_update_time (f64 now, u32 thread_index); - /** TCP connection flags */ #define foreach_tcp_connection_flag \ _(SNDACK, "Send ACK") \ @@ -481,6 +479,13 @@ tcp_time_now (void) return clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; } +always_inline void +tcp_update_time (f64 now, u32 thread_index) +{ + tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], + now); +} + u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 7e9fa47b..ae1f92d5 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1841,6 +1841,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: + vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv 
(tm, tc0, b0, n_data_bytes0, &next0); break; case TCP_STATE_CLOSE_WAIT: @@ -2410,12 +2411,6 @@ VLIB_REGISTER_NODE (tcp6_input_node) = /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input); -void -tcp_update_time (f64 now, u32 thread_index) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now); -} static void tcp_dispatch_table_init (tcp_main_t * tm) diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 0725bb04..3dbbdf6f 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -12,7 +12,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #define TCP_TEST_I(_cond, _comment, _args...) \ @@ -174,6 +173,118 @@ tcp_test_sack () return 0; } +static int +tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 1 << 20; + u32 *test_data = 0; + u32 offset; + int i, rv; + u32 data_word, test_data_len; + + /* $$$ parse args */ + test_data_len = fifo_size / sizeof (u32); + vec_validate (test_data, test_data_len - 1); + + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i; + + f = svm_fifo_create (fifo_size); + + /* Paint fifo data vector with -1's */ + memset (f->data, 0xFF, test_data_len); + + /* Enqueue an initial (un-dequeued) chunk */ + rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , + sizeof (u32), (u8 *) test_data); + + if (rv != sizeof (u32)) + { + clib_warning ("enqueue returned %d", rv); + goto out; + } + + /* + * Create 3 chunks in the future. 
The offsets are relative + * to the current fifo tail + */ + for (i = 0; i < 3; i++) + { + offset = (2 * i + 1) * sizeof (u32); + vlib_cli_output (vm, "add offset %d", offset); + + rv = svm_fifo_enqueue_with_offset + (f, 0 /* pid */ , offset, sizeof (u32), + (u8 *) (test_data + ((offset + sizeof (u32)) / sizeof (u32)))); + + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto out; + } + } + + /* Paint missing data backwards */ + for (i = 3; i > 0; i--) + { + offset = (2 * i + 0) * sizeof (u32); + + vlib_cli_output (vm, "add offset %d", offset); + + rv = svm_fifo_enqueue_with_offset + (f, 0 /* pid */ , offset, sizeof (u32), + (u8 *) (test_data + ((offset + sizeof (u32)) / sizeof (u32)))); + + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto out; + } + } + + vlib_cli_output (vm, "fifo before missing link: %U", + format_svm_fifo, f, 1 /* verbose */ ); + + /* Enqueue the missing u32 */ + rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , + sizeof (u32), (u8 *) (test_data + 1)); + if (rv != 7 * sizeof (u32)) + { + clib_warning ("enqueue returned %d", rv); + goto out; + } + + vlib_cli_output (vm, "fifo after missing link: %U", + format_svm_fifo, f, 1 /* verbose */ ); + + /* Collect results */ + for (i = 0; i < 7; i++) + { + rv = svm_fifo_dequeue_nowait (f, 0 /* pid */ , sizeof (u32), + (u8 *) & data_word); + if (rv != sizeof (u32)) + { + clib_warning ("dequeue returned %d", rv); + goto out; + } + if (data_word != test_data[i]) + { + clib_warning ("recovered data %d not %d", data_word, test_data[i]); + goto out; + } + } + + clib_warning ("test complete..."); + +out: + svm_fifo_free (f); + vec_free (test_data); + return 0; +} + + + static clib_error_t * tcp_test (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd_arg) @@ -186,6 +297,10 @@ tcp_test (vlib_main_t * vm, { res = tcp_test_sack (); } + else if (unformat (input, "fifo")) + { + res = tcp_test_fifo (vm, input); + } else { return clib_error_return (0, "unknown input `%U'", @@ 
-203,10 +318,16 @@ tcp_test (vlib_main_t * vm, } } +/* *INDENT-OFF* */ VLIB_CLI_COMMAND (tcp_test_command, static) = { -.path = "test tcp",.short_help = "internal tcp unit tests",.function = - tcp_test,}; + .path = "test tcp", + .short_help = "internal tcp unit tests", + .function = tcp_test, +}; +/* *INDENT-ON* */ + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c index 57f774c5..8565f04c 100644 --- a/src/vnet/udp/builtin_server.c +++ b/src/vnet/udp/builtin_server.c @@ -91,12 +91,11 @@ static session_cb_vft_t builtin_server = { /* *INDENT-ON* */ static int -bind_builtin_uri_server (u8 * uri) +attach_builtin_uri_server () { - vnet_bind_args_t _a, *a = &_a; - char segment_name[128]; + vnet_app_attach_args_t _a, *a = &_a; + u8 segment_name[128]; u32 segment_name_length; - int rv; u64 options[16]; segment_name_length = ARRAY_LEN (segment_name); @@ -104,8 +103,7 @@ bind_builtin_uri_server (u8 * uri) memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); - a->uri = (char *) uri; - a->api_client_index = ~0; /* built-in server */ + a->api_client_index = ~0; a->segment_name = segment_name; a->segment_name_length = segment_name_length; a->session_cb_vft = &builtin_server; @@ -114,6 +112,23 @@ bind_builtin_uri_server (u8 * uri) options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ a->options = options; + return vnet_application_attach (a); +} + +static int +bind_builtin_uri_server (u8 * uri) +{ + vnet_bind_args_t _a, *a = &_a; + int rv; + + rv = attach_builtin_uri_server (); + if (rv) + return rv; + + memset (a, 0, sizeof (*a)); + a->uri = (char *) uri; + a->app_index = ~0; /* built-in server */ + rv = vnet_bind_uri (a); return rv; @@ -122,11 +137,12 @@ bind_builtin_uri_server (u8 * uri) static int unbind_builtin_uri_server (u8 * uri) { - int rv; + vnet_unbind_args_t _a, *a = &_a; - rv = vnet_unbind_uri ((char *) uri, ~0 /* client_index */ ); + a->app_index = ~0; + 
a->uri = (char *) uri; - return rv; + return vnet_unbind_uri (a); } static clib_error_t * -- cgit 1.2.3-korg From 6cf260cb2a8b41450850a1578c708e1dd5af699f Mon Sep 17 00:00:00 2001 From: Clement Durand Date: Thu, 13 Apr 2017 13:27:04 +0200 Subject: tcp: completed trace functions * Populated the trace struct that weren't used before * Modified and created format functions to format tcp packets and connections * Completed the node definitions to add the format_trace function * Filled the tracing parts with `vlib_add_trace' calls in each tcp node function For the nodes in tcp_input.c, there is a verbose trace and a non-verbose trace. Each packet goes through tcp[4-6]-input which is traced with `format_tcp_rx_trace', and the other nodes are traced with `format_tcp_rx_trace_short'. Change-Id: I4f2eed023f5973c14343132a33b06131cf063aa2 Signed-off-by: Pierre Pfister Signed-off-by: Clement Durand --- src/vnet/tcp/tcp_input.c | 100 +++++++++++++++++++++++++++++++++------------- src/vnet/tcp/tcp_output.c | 45 +++++++++++++++++---- 2 files changed, 110 insertions(+), 35 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ae1f92d5..a12ad8c0 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1125,6 +1125,43 @@ done: return error; } +typedef struct +{ + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; +} tcp_rx_trace_t; + +u8 * +format_tcp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + uword indent = format_get_indent (s); + + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection_verbose, &t->tcp_connection); + + return s; +} + +u8 * +format_tcp_rx_trace_short (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg 
(*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + + s = format (s, "%d -> %d (%U)", + clib_net_to_host_u16 (t->tcp_header.src_port), + clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state, + &t->tcp_connection.state); + + return s; +} + always_inline void tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val) { @@ -1160,6 +1197,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *th0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1250,7 +1288,10 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1296,6 +1337,7 @@ VLIB_REGISTER_NODE (tcp4_established_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1317,6 +1359,7 @@ VLIB_REGISTER_NODE (tcp6_established_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1350,6 +1393,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0, ack0, seq0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1545,7 +1589,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->error = error0 ? 
node->errors[error0] : 0; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1599,6 +1646,7 @@ VLIB_REGISTER_NODE (tcp4_syn_sent_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1619,8 +1667,9 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) = #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, foreach_tcp_state_next #undef _ - } -,}; + }, + .format_trace = format_tcp_rx_trace_short, +}; /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); @@ -1651,6 +1700,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1899,7 +1949,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1953,6 +2006,7 @@ VLIB_REGISTER_NODE (tcp4_rcv_process_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1974,6 +2028,7 @@ VLIB_REGISTER_NODE (tcp6_rcv_process_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -2009,6 +2064,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *th0 = 0; tcp_connection_t *lc0; ip4_header_t *ip40; @@ -2116,7 +2172,10 @@ 
tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, lc0, + sizeof (t0->tcp_connection)); } b0->error = node->errors[error0]; @@ -2160,6 +2219,7 @@ VLIB_REGISTER_NODE (tcp4_listen_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -2181,6 +2241,7 @@ VLIB_REGISTER_NODE (tcp6_listen_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -2216,27 +2277,6 @@ typedef enum _tcp_input_next _ (ESTABLISHED, "tcp6-established") \ _ (RESET, "tcp6-reset") -typedef struct -{ - u16 src_port; - u16 dst_port; - u8 state; -} tcp_rx_trace_t; - -u8 * -format_tcp_rx_trace (u8 * s, va_list * args) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); - - s = format (s, "TCP: src-port %d dst-port %U%s\n", - clib_net_to_host_u16 (t->src_port), - clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state); - - return s; -} - #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) always_inline uword @@ -2262,6 +2302,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -2339,7 +2380,10 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, 
to_next, diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index e18bfad7..d2fa1d7b 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -42,9 +42,8 @@ static char *tcp_error_strings[] = { typedef struct { - u16 src_port; - u16 dst_port; - u8 state; + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; } tcp_tx_trace_t; u16 dummy_mtu = 400; @@ -54,8 +53,13 @@ format_tcp_tx_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *); + uword indent = format_get_indent (s); - s = format (s, "TBD\n"); + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection_verbose, &t->tcp_connection); return s; } @@ -1331,7 +1335,8 @@ tcp46_output_inline (vlib_main_t * vm, u32 bi0; vlib_buffer_t *b0; tcp_connection_t *tc0; - tcp_header_t *th0; + tcp_tx_trace_t *t0; + tcp_header_t *th0 = 0; u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1424,7 +1429,17 @@ tcp46_output_inline (vlib_main_t * vm, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + if (th0) + { + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + else + { + memset (&t0->tcp_header, 0, sizeof (t0->tcp_header)); + } + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1541,6 +1556,9 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_tx_trace_t *t0; + tcp_header_t *th0; + tcp_connection_t *tc0; u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1569,7 +1587,18 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * 
node, b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + th0 = vlib_buffer_get_current (b0); + if (is_ip4) + th0 = ip4_next_header ((ip4_header_t *) th0); + else + th0 = ip6_next_header ((ip6_header_t *) th0); + tc0 = + tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1607,6 +1636,7 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { foreach_tcp4_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ @@ -1625,6 +1655,7 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { foreach_tcp6_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ -- cgit 1.2.3-korg From 1f75cfd73320476a8f821064391fe368dd4bf75b Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Fri, 14 Apr 2017 16:46:44 -0400 Subject: Fix fifo ooo bugs and improve testing Change-Id: If3c01e318bcb740ca5b240c63f712e2167082a80 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 126 +++++++---- src/svm/svm_fifo.h | 21 +- src/vnet/tcp/tcp.c | 2 +- src/vnet/tcp/tcp.h | 1 + src/vnet/tcp/tcp_format.c | 2 +- src/vnet/tcp/tcp_input.c | 28 ++- src/vnet/tcp/tcp_test.c | 516 ++++++++++++++++++++++++++++++++++++++++++---- 7 files changed, 602 insertions(+), 94 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 097bab77..bd968aea 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -15,6 +15,36 @@ #include +#define offset_lt(_a, _b) ((i32)((_a)-(_b)) < 0) +#define offset_leq(_a, _b) ((i32)((_a)-(_b)) <= 0) + +u8 * +format_ooo_segment (u8 * s, va_list * args) +{ + ooo_segment_t *seg = va_arg (*args, ooo_segment_t *); + + s = format (s, "pos %u, len %u, 
next %d, prev %d", + seg->start, seg->length, seg->next, seg->prev); + return s; +} + +u8 * +format_ooo_list (u8 * s, va_list * args) +{ + svm_fifo_t *f = va_arg (*args, svm_fifo_t *); + u32 ooo_segment_index = f->ooos_list_head; + ooo_segment_t *seg; + + while (ooo_segment_index != OOO_SEGMENT_INVALID_INDEX) + { + seg = pool_elt_at_index (f->ooo_segments, ooo_segment_index); + s = format (s, "\n %U", format_ooo_segment, seg); + + ooo_segment_index = seg->next; + } + return s; +} + /** create an svm fifo, in the current heap. Fails vs blow up the process */ svm_fifo_t * svm_fifo_create (u32 data_size_in_bytes) @@ -47,7 +77,7 @@ ooo_segment_new (svm_fifo_t * f, u32 start, u32 length) pool_get (f->ooo_segments, s); - s->fifo_position = start; + s->start = start; s->length = length; s->prev = s->next = OOO_SEGMENT_INVALID_INDEX; @@ -88,14 +118,13 @@ static void ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { ooo_segment_t *s, *new_s, *prev, *next, *it; - u32 new_index, position, end_offset, s_sof, s_eof, s_index; + u32 new_index, end_offset, s_sof, s_eof, s_index; - position = (f->tail + offset) % f->nitems; end_offset = offset + length; if (f->ooos_list_head == OOO_SEGMENT_INVALID_INDEX) { - s = ooo_segment_new (f, position, length); + s = ooo_segment_new (f, offset, length); f->ooos_list_head = s - f->ooo_segments; f->ooos_newest = f->ooos_list_head; return; @@ -104,26 +133,26 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) /* Find first segment that starts after new segment */ s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); while (s->next != OOO_SEGMENT_INVALID_INDEX - && ooo_segment_offset (f, s) <= offset) + && offset_leq (ooo_segment_offset (f, s), offset)) s = pool_elt_at_index (f->ooo_segments, s->next); s_index = s - f->ooo_segments; s_sof = ooo_segment_offset (f, s); s_eof = ooo_segment_end_offset (f, s); + prev = ooo_segment_get_prev (f, s); /* No overlap, add before current segment */ - if (end_offset < s_sof) + if 
(offset_lt (end_offset, s_sof) + && (!prev || offset_lt (prev->start + prev->length, offset))) { - new_s = ooo_segment_new (f, position, length); + new_s = ooo_segment_new (f, offset, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ s = pool_elt_at_index (f->ooo_segments, s_index); - if (s->prev != OOO_SEGMENT_INVALID_INDEX) { new_s->prev = s->prev; - prev = pool_elt_at_index (f->ooo_segments, new_s->prev); prev->next = new_index; } @@ -139,9 +168,9 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) return; } /* No overlap, add after current segment */ - else if (s_eof < offset) + else if (offset_lt (s_eof, offset)) { - new_s = ooo_segment_new (f, position, length); + new_s = ooo_segment_new (f, offset, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ @@ -150,7 +179,6 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) if (s->next != OOO_SEGMENT_INVALID_INDEX) { new_s->next = s->next; - next = pool_elt_at_index (f->ooo_segments, new_s->next); next->prev = new_index; } @@ -167,7 +195,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) */ /* Merge at head */ - if (offset <= s_sof) + if (offset_leq (offset, s_sof)) { /* If we have a previous, check if we overlap */ if (s->prev != OOO_SEGMENT_INVALID_INDEX) @@ -176,26 +204,31 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) /* New segment merges prev and current. Remove previous and * update position of current. 
*/ - if (ooo_segment_end_offset (f, prev) >= offset) + if (offset_leq (offset, ooo_segment_end_offset (f, prev))) { - s->fifo_position = prev->fifo_position; + s->start = prev->start; s->length = s_eof - ooo_segment_offset (f, prev); ooo_segment_del (f, s->prev); } + else + { + s->start = offset; + s->length = s_eof - ooo_segment_offset (f, s); + } } else { - s->fifo_position = position; + s->start = offset; s->length = s_eof - ooo_segment_offset (f, s); } /* The new segment's tail may cover multiple smaller ones */ - if (s_eof < end_offset) + if (offset_lt (s_eof, end_offset)) { /* Remove segments completely covered */ it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? pool_elt_at_index (f->ooo_segments, s->next) : 0; - while (it && ooo_segment_end_offset (f, it) < end_offset) + while (it && offset_lt (ooo_segment_end_offset (f, it), end_offset)) { next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? pool_elt_at_index (f->ooo_segments, it->next) : 0; @@ -207,7 +240,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) s->length = end_offset - ooo_segment_offset (f, s); /* If partial overlap with last, merge */ - if (it && ooo_segment_offset (f, it) < end_offset) + if (it && offset_lt (ooo_segment_offset (f, it), end_offset)) { s->length += it->length - (ooo_segment_offset (f, it) - end_offset); @@ -216,7 +249,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) } } /* Last but overlapping previous */ - else if (s_eof <= end_offset) + else if (offset_leq (s_eof, end_offset)) { s->length = end_offset - ooo_segment_offset (f, s); } @@ -247,7 +280,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); /* If last tail update overlaps one/multiple ooo segments, remove them */ - diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + diff = (f->nitems + ((int) s->start - f->tail)) % f->nitems; while (0 < diff && diff < n_bytes_enqueued) { /* Segment end is beyond the tail. 
Advance tail and be done */ @@ -262,7 +295,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { index = s - f->ooo_segments; s = pool_elt_at_index (f->ooo_segments, s->next); - diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + diff = (f->nitems + ((int) s->start - f->tail)) % f->nitems; ooo_segment_del (f, index); } /* End of search */ @@ -368,9 +401,20 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - u32 tail_plus_offset; + u32 normalized_offset; + int rv; - ASSERT (offset > 0); + /* Safety: don't wrap more than nitems/2 */ + ASSERT ((f->nitems + offset - f->tail) % f->nitems < f->nitems / 2); + + /* Users would do well to avoid this */ + if (PREDICT_FALSE (f->tail == (offset % f->nitems))) + { + rv = svm_fifo_enqueue_internal (f, pid, required_bytes, copy_from_here); + if (rv > 0) + return 0; + return -1; + } /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); @@ -384,24 +428,24 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, /* Number of bytes we're going to copy */ total_copy_bytes = required_bytes; - tail_plus_offset = (f->tail + offset) % nitems; + normalized_offset = offset % nitems; /* Number of bytes in first copy segment */ - first_copy_bytes = ((nitems - tail_plus_offset) < total_copy_bytes) - ? (nitems - tail_plus_offset) : total_copy_bytes; + first_copy_bytes = ((nitems - normalized_offset) < total_copy_bytes) + ?
(nitems - normalized_offset) : total_copy_bytes; - clib_memcpy (&f->data[tail_plus_offset], copy_from_here, first_copy_bytes); + clib_memcpy (&f->data[normalized_offset], copy_from_here, first_copy_bytes); /* Number of bytes in second copy segment, if any */ second_copy_bytes = total_copy_bytes - first_copy_bytes; if (second_copy_bytes) { - tail_plus_offset += first_copy_bytes; - tail_plus_offset %= nitems; + normalized_offset += first_copy_bytes; + normalized_offset %= nitems; - ASSERT (tail_plus_offset == 0); + ASSERT (normalized_offset == 0); - clib_memcpy (&f->data[tail_plus_offset], + clib_memcpy (&f->data[normalized_offset], copy_from_here + first_copy_bytes, second_copy_bytes); } @@ -573,8 +617,8 @@ format_svm_fifo (u8 * s, va_list * args) ooo_segment_t *seg; u32 seg_index; - s = - format (s, "ooo pool %d active elts\n", pool_elts (f->ooo_segments)); + s = format (s, "ooo pool %d active elts\n", + pool_elts (f->ooo_segments)); seg_index = f->ooos_list_head; @@ -582,13 +626,25 @@ format_svm_fifo (u8 * s, va_list * args) { seg = pool_elt_at_index (f->ooo_segments, seg_index); s = format (s, " pos %u, len %u next %d\n", - seg->fifo_position, seg->length, seg->next); + seg->start, seg->length, seg->next); seg_index = seg->next; } } return s; } +u32 +svm_fifo_number_ooo_segments (svm_fifo_t * f) +{ + return pool_elts (f->ooo_segments); +} + +ooo_segment_t * +svm_fifo_first_ooo_segment (svm_fifo_t * f) +{ + return pool_elt_at_index (f->ooo_segments, f->ooos_list_head); +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 9beb63f5..0fff2577 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -36,10 +36,13 @@ typedef struct u32 next; /**< Next linked-list element pool index */ u32 prev; /**< Previous linked-list element pool index */ - u32 fifo_position; /**< Start of segment, normalized*/ + u32 start; /**< Start of segment, normalized*/ u32 length; /**< Length of segment */ } ooo_segment_t; 
+format_function_t format_ooo_segment; +format_function_t format_ooo_list; + #define OOO_SEGMENT_INVALID_INDEX ((u32)~0) typedef struct @@ -127,6 +130,8 @@ int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, int svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u8 * copy_here); int svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes); +u32 svm_fifo_number_ooo_segments (svm_fifo_t * f); +ooo_segment_t *svm_fifo_first_ooo_segment (svm_fifo_t * f); format_function_t format_svm_fifo; @@ -139,13 +144,23 @@ svm_fifo_newest_ooo_segment (svm_fifo_t * f) always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ((f->nitems + s->fifo_position - f->tail) % f->nitems); +// return ((f->nitems + s->fifo_position - f->tail) % f->nitems); + return s->start; } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); +// return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); + return s->start + s->length; +} + +always_inline ooo_segment_t * +ooo_segment_get_prev (svm_fifo_t * f, ooo_segment_t * s) +{ + if (s->prev == OOO_SEGMENT_INVALID_INDEX) + return 0; + return pool_elt_at_index (f->ooo_segments, s->prev); } #endif /* __included_ssvm_fifo_h__ */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index a0c66b9f..12982589 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -447,7 +447,7 @@ format_tcp_state (u8 * s, va_list * args) if (*state < TCP_N_STATES) s = format (s, "%s", tcp_fsm_states[*state]); else - s = format (s, "UNKNOWN"); + s = format (s, "UNKNOWN (%d (0x%x))", *state, *state); return s; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 225b26da..2ac6a9b8 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -58,6 +58,7 @@ typedef enum _tcp_state } tcp_state_t; format_function_t format_tcp_state; +format_function_t format_tcp_flags; /** TCP timers */ 
#define foreach_tcp_timer \ diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c index 7136741d..994ccfd6 100644 --- a/src/vnet/tcp/tcp_format.c +++ b/src/vnet/tcp/tcp_format.c @@ -40,7 +40,7 @@ #include #include -static u8 * +u8 * format_tcp_flags (u8 * s, va_list * args) { int flags = va_arg (*args, int); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a12ad8c0..97679aaf 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -211,8 +211,6 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - /* XXX normally test for timestamp should be lt instead of leq, but for - * local testing this is not enough */ return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent && timestamp_lt (tc->opt.tsval, tc->tsval_recent); } @@ -999,7 +997,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; - u32 offset, seq; + u32 offset; int rv; /* Pure ACK. 
Do nothing */ @@ -1009,8 +1007,9 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, } s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); - seq = vnet_buffer (b)->tcp.seq_number; - offset = seq - tc->rcv_nxt; + offset = vnet_buffer (b)->tcp.seq_number - tc->irs; + + clib_warning ("ooo: offset %d len %d", offset, data_len); rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, data_len, vlib_buffer_get_current (b)); @@ -1032,8 +1031,8 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, /* Get the newest segment from the fifo */ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); - start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); - end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest); + start = ooo_segment_offset (s0->server_rx_fifo, newest); + end = ooo_segment_end_offset (s0->server_rx_fifo, newest); tcp_update_sack_list (tc, start, end); } @@ -1072,6 +1071,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, { /* Old sequence numbers allowed through because they overlapped * the rx window */ + if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) { error = TCP_ERROR_SEGMENT_OLD; @@ -1181,6 +1181,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); + u8 is_fin = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1243,9 +1244,11 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_advance_bytes0 += sizeof (ip60[0]); } + is_fin = (th0->flags & TCP_FLAG_FIN) != 0; + /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0; + + tcp_is_syn (th0) + is_fin + n_data_bytes0; /* TODO header prediction 
fast path */ @@ -1272,8 +1275,11 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a + * dangling reference. */ + /* 8: check the FIN bit */ - if (tcp_fin (th0)) + if (is_fin) { /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead * wait for session to call close. To avoid lingering @@ -2365,8 +2371,12 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) { + tcp_state_t state0 = tc0->state; /* Overload tcp flags to store state */ vnet_buffer (b0)->tcp.flags = tc0->state; + clib_warning ("disp error state %U flags %U", + format_tcp_state, &state0, + format_tcp_flags, flags0); } } else diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 3dbbdf6f..12579632 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -173,17 +173,145 @@ tcp_test_sack () return 0; } -static int -tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) +typedef struct +{ + u32 offset; + u32 len; +} test_pattern_t; + +/* *INDENT-OFF* */ +test_pattern_t test_pattern[] = { + {380, 8}, {768, 8}, {1156, 8}, {1544, 8}, {1932, 8}, {2320, 8}, {2708, 8}, + {2992, 8}, {372, 8}, {760, 8}, {1148, 8}, {1536, 8}, {1924, 8}, {2312, 8}, + {2700, 8}, {2984, 8}, {364, 8}, {752, 8}, {1140, 8}, {1528, 8}, {1916, 8}, + {2304, 8}, {2692, 8}, {2976, 8}, {356, 8}, {744, 8}, {1132, 8}, {1520, 8}, + {1908, 8}, {2296, 8}, {2684, 8}, {2968, 8}, {348, 8}, {736, 8}, {1124, 8}, + {1512, 8}, {1900, 8}, {2288, 8}, {2676, 8}, {2960, 8}, {340, 8}, {728, 8}, + {1116, 8}, {1504, 8}, {1892, 8}, {2280, 8}, {2668, 8}, {2952, 8}, {332, 8}, + {720, 8}, {1108, 8}, {1496, 8}, {1884, 8}, {2272, 8}, {2660, 8}, {2944, 8}, + {324, 8}, {712, 8}, {1100, 8}, {1488, 8}, {1876, 8}, {2264, 8}, {2652, 8}, + {2936, 8}, {316, 8}, {704, 8}, 
{1092, 8}, {1480, 8}, {1868, 8}, {2256, 8}, + {2644, 8}, {2928, 8}, {308, 8}, {696, 8}, {1084, 8}, {1472, 8}, {1860, 8}, + {2248, 8}, {2636, 8}, {2920, 8}, {300, 8}, {688, 8}, {1076, 8}, {1464, 8}, + {1852, 8}, {2240, 8}, {2628, 8}, {2912, 8}, {292, 8}, {680, 8}, {1068, 8}, + {1456, 8}, {1844, 8}, {2232, 8}, {2620, 8}, {2904, 8}, {284, 8}, {672, 8}, + {1060, 8}, {1448, 8}, {1836, 8}, {2224, 8}, {2612, 8}, {2896, 8}, {276, 8}, + {664, 8}, {1052, 8}, {1440, 8}, {1828, 8}, {2216, 8}, {2604, 8}, {2888, 8}, + {268, 8}, {656, 8}, {1044, 8}, {1432, 8}, {1820, 8}, {2208, 8}, {2596, 8}, + {2880, 8}, {260, 8}, {648, 8}, {1036, 8}, {1424, 8}, {1812, 8}, {2200, 8}, + {2588, 8}, {2872, 8}, {252, 8}, {640, 8}, {1028, 8}, {1416, 8}, {1804, 8}, + {2192, 8}, {2580, 8}, {2864, 8}, {244, 8}, {632, 8}, {1020, 8}, {1408, 8}, + {1796, 8}, {2184, 8}, {2572, 8}, {2856, 8}, {236, 8}, {624, 8}, {1012, 8}, + {1400, 8}, {1788, 8}, {2176, 8}, {2564, 8}, {2848, 8}, {228, 8}, {616, 8}, + {1004, 8}, {1392, 8}, {1780, 8}, {2168, 8}, {2556, 8}, {2840, 8}, {220, 8}, + {608, 8}, {996, 8}, {1384, 8}, {1772, 8}, {2160, 8}, {2548, 8}, {2832, 8}, + {212, 8}, {600, 8}, {988, 8}, {1376, 8}, {1764, 8}, {2152, 8}, {2540, 8}, + {2824, 8}, {204, 8}, {592, 8}, {980, 8}, {1368, 8}, {1756, 8}, {2144, 8}, + {2532, 8}, {2816, 8}, {196, 8}, {584, 8}, {972, 8}, {1360, 8}, {1748, 8}, + {2136, 8}, {2524, 8}, {2808, 8}, {188, 8}, {576, 8}, {964, 8}, {1352, 8}, + {1740, 8}, {2128, 8}, {2516, 8}, {2800, 8}, {180, 8}, {568, 8}, {956, 8}, + {1344, 8}, {1732, 8}, {2120, 8}, {2508, 8}, {2792, 8}, {172, 8}, {560, 8}, + {948, 8}, {1336, 8}, {1724, 8}, {2112, 8}, {2500, 8}, {2784, 8}, {164, 8}, + {552, 8}, {940, 8}, {1328, 8}, {1716, 8}, {2104, 8}, {2492, 8}, {2776, 8}, + {156, 8}, {544, 8}, {932, 8}, {1320, 8}, {1708, 8}, {2096, 8}, {2484, 8}, + {2768, 8}, {148, 8}, {536, 8}, {924, 8}, {1312, 8}, {1700, 8}, {2088, 8}, + {2476, 8}, {2760, 8}, {140, 8}, {528, 8}, {916, 8}, {1304, 8}, {1692, 8}, + {2080, 8}, {2468, 8}, {2752, 8}, 
{132, 8}, {520, 8}, {908, 8}, {1296, 8}, + {1684, 8}, {2072, 8}, {2460, 8}, {2744, 8}, {124, 8}, {512, 8}, {900, 8}, + {1288, 8}, {1676, 8}, {2064, 8}, {2452, 8}, {2736, 8}, {116, 8}, {504, 8}, + {892, 8}, {1280, 8}, {1668, 8}, {2056, 8}, {2444, 8}, {2728, 8}, {108, 8}, + {496, 8}, {884, 8}, {1272, 8}, {1660, 8}, {2048, 8}, {2436, 8}, {2720, 8}, + {100, 8}, {488, 8}, {876, 8}, {1264, 8}, {1652, 8}, {2040, 8}, {2428, 8}, + {2716, 4}, {92, 8}, {480, 8}, {868, 8}, {1256, 8}, {1644, 8}, {2032, 8}, + {2420, 8}, {84, 8}, {472, 8}, {860, 8}, {1248, 8}, {1636, 8}, {2024, 8}, + {2412, 8}, {76, 8}, {464, 8}, {852, 8}, {1240, 8}, {1628, 8}, {2016, 8}, + {2404, 8}, {68, 8}, {456, 8}, {844, 8}, {1232, 8}, {1620, 8}, {2008, 8}, + {2396, 8}, {60, 8}, {448, 8}, {836, 8}, {1224, 8}, {1612, 8}, {2000, 8}, + {2388, 8}, {52, 8}, {440, 8}, {828, 8}, {1216, 8}, {1604, 8}, {1992, 8}, + {2380, 8}, {44, 8}, {432, 8}, {820, 8}, {1208, 8}, {1596, 8}, {1984, 8}, + {2372, 8}, {36, 8}, {424, 8}, {812, 8}, {1200, 8}, {1588, 8}, {1976, 8}, + {2364, 8}, {28, 8}, {416, 8}, {804, 8}, {1192, 8}, {1580, 8}, {1968, 8}, + {2356, 8}, {20, 8}, {408, 8}, {796, 8}, {1184, 8}, {1572, 8}, {1960, 8}, + {2348, 8}, {12, 8}, {400, 8}, {788, 8}, {1176, 8}, {1564, 8}, {1952, 8}, + {2340, 8}, {4, 8}, {392, 8}, {780, 8}, {1168, 8}, {1556, 8}, {1944, 8}, + {2332, 8}, + /* missing from original data set */ + {388, 4}, {776, 4}, {1164, 4}, {1552, 4}, {1940, 4}, {2328, 4}, +}; +/* *INDENT-ON* */ + +int +pattern_cmp (const void *arg1, const void *arg2) +{ + test_pattern_t *a1 = (test_pattern_t *) arg1; + test_pattern_t *a2 = (test_pattern_t *) arg2; + + if (a1->offset < a2->offset) + return -1; + else if (a1->offset > a2->offset) + return 1; + return 0; +} + +static u8 +fifo_validate_pattern (vlib_main_t * vm, test_pattern_t * pattern, + u32 pattern_length) +{ + test_pattern_t *tp = pattern; + int i; + + /* Go through the pattern and make 100% sure it's sane */ + for (i = 0; i < pattern_length - 1; i++) + { + if 
(tp->offset + tp->len != (tp + 1)->offset) + { + vlib_cli_output (vm, "[%d] missing {%d, %d}", i, + (tp->offset + tp->len), + (tp + 1)->offset - (tp->offset + tp->len)); + return 0; + } + tp++; + } + return 1; +} + +static test_pattern_t * +fifo_get_validate_pattern (vlib_main_t * vm, test_pattern_t * test_data, + u32 test_data_len) +{ + test_pattern_t *validate_pattern = 0; + + /* Validate, and try segments in order... */ + vec_validate (validate_pattern, test_data_len - 1); + memcpy (validate_pattern, test_data, + test_data_len * sizeof (test_pattern_t)); + qsort ((u8 *) validate_pattern, test_data_len, sizeof (test_pattern_t), + pattern_cmp); + + if (fifo_validate_pattern (vm, validate_pattern, test_data_len) == 0) + return 0; + + return validate_pattern; +} + +int +tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { svm_fifo_t *f; u32 fifo_size = 1 << 20; u32 *test_data = 0; u32 offset; - int i, rv; + int i, rv, verbose = 0; u32 data_word, test_data_len; + ooo_segment_t *ooo_seg; + u8 *data; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + } - /* $$$ parse args */ test_data_len = fifo_size / sizeof (u32); vec_validate (test_data, test_data_len - 1); @@ -198,12 +326,8 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) /* Enqueue an initial (un-dequeued) chunk */ rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , sizeof (u32), (u8 *) test_data); - - if (rv != sizeof (u32)) - { - clib_warning ("enqueue returned %d", rv); - goto out; - } + TCP_TEST ((rv == sizeof (u32)), "enqueued %d", rv); + TCP_TEST ((f->tail == 4), "fifo tail %u", f->tail); /* * Create 3 chunks in the future. 
The offsets are relative @@ -212,51 +336,62 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 3; i++) { offset = (2 * i + 1) * sizeof (u32); - vlib_cli_output (vm, "add offset %d", offset); - - rv = svm_fifo_enqueue_with_offset - (f, 0 /* pid */ , offset, sizeof (u32), - (u8 *) (test_data + ((offset + sizeof (u32)) / sizeof (u32)))); - + data = (u8 *) (test_data + (2 * i + 1)); + rv = + svm_fifo_enqueue_with_offset (f, 0 /* pid */ , offset, sizeof (u32), + data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, + offset + sizeof (u32)); if (rv) { clib_warning ("enqueue returned %d", rv); - goto out; + goto err; } } - /* Paint missing data backwards */ - for (i = 3; i > 0; i--) + if (verbose) + vlib_cli_output (vm, "fifo after odd segs: %U", format_svm_fifo, f, 1); + TCP_TEST ((f->tail == 8), "fifo tail %u", f->tail); + + /* Paint some of missing data backwards */ + for (i = 3; i > 1; i--) { offset = (2 * i + 0) * sizeof (u32); - - vlib_cli_output (vm, "add offset %d", offset); - - rv = svm_fifo_enqueue_with_offset - (f, 0 /* pid */ , offset, sizeof (u32), - (u8 *) (test_data + ((offset + sizeof (u32)) / sizeof (u32)))); - + data = (u8 *) (test_data + (2 * i + 0)); + rv = + svm_fifo_enqueue_with_offset (f, 0 /* pid */ , offset, sizeof (u32), + data); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i, offset, + offset + sizeof (u32)); if (rv) { clib_warning ("enqueue returned %d", rv); - goto out; + goto err; } } - vlib_cli_output (vm, "fifo before missing link: %U", - format_svm_fifo, f, 1 /* verbose */ ); + if (verbose) + vlib_cli_output (vm, "fifo before missing link: %U", format_svm_fifo, f, + 1); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 12), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 16), + "first ooo seg 
length %u", ooo_seg->length); /* Enqueue the missing u32 */ - rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , - sizeof (u32), (u8 *) (test_data + 1)); - if (rv != 7 * sizeof (u32)) - { - clib_warning ("enqueue returned %d", rv); - goto out; - } - - vlib_cli_output (vm, "fifo after missing link: %U", - format_svm_fifo, f, 1 /* verbose */ ); + rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , sizeof (u32), + (u8 *) (test_data + 2)); + if (verbose) + vlib_cli_output (vm, "fifo after missing link: %U", format_svm_fifo, f, + 1); + TCP_TEST ((rv == 20), "bytes to be enqueued %u", rv); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); /* Collect results */ for (i = 0; i < 7; i++) @@ -265,25 +400,316 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) (u8 *) & data_word); if (rv != sizeof (u32)) { - clib_warning ("dequeue returned %d", rv); - goto out; + clib_warning ("bytes dequeues %u", rv); + goto err; } if (data_word != test_data[i]) { - clib_warning ("recovered data %d not %d", data_word, test_data[i]); - goto out; + clib_warning ("recovered [%d] %d not %d", i, data_word, + test_data[i]); + goto err; } } - clib_warning ("test complete..."); + svm_fifo_free (f); + vec_free (test_data); + return 0; -out: +err: svm_fifo_free (f); vec_free (test_data); + return -1; +} + +static int +tcp_test_fifo2 (vlib_main_t * vm) +{ + svm_fifo_t *f; + u32 fifo_size = 1 << 20; + int i, rv, test_data_len; + u64 data64; + test_pattern_t *tp, *vp, *test_data; + ooo_segment_t *ooo_seg; + + test_data = test_pattern; + test_data_len = ARRAY_LEN (test_pattern); + + vp = fifo_get_validate_pattern (vm, test_data, test_data_len); + + /* Create a fifo */ + f = svm_fifo_create (fifo_size); + + /* Paint the fifo data vector with -1's */ + memset (f->data, 0xFF, 1 << 20); + + /* + * Try with sorted data + */ + for (i = 0; i < test_data_len; i++) + { + tp = vp + i; + data64 = tp->offset; + rv = svm_fifo_enqueue_with_offset (f, 
0, tp->offset, tp->len, + (u8 *) & data64); + } + + /* Expected result: one big fat chunk at offset 4 */ + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 4), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 2996), + "first ooo seg length %u", ooo_seg->length); + + data64 = 0; + rv = svm_fifo_enqueue_nowait (f, 0, sizeof (u32), (u8 *) & data64); + TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); + + svm_fifo_free (f); + vec_free (vp); + + /* + * Now try it again w/ unsorted data... + */ + + f = svm_fifo_create (fifo_size); + + /* Paint fifo data vector with -1's */ + memset (f->data, 0xFF, 1 << 20); + + for (i = 0; i < test_data_len; i++) + { + tp = &test_data[i]; + data64 = tp->offset; + rv = svm_fifo_enqueue_with_offset (f, 0, tp->offset, tp->len, + (u8 *) & data64); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + } + } + + /* Expecting the same result: one big fat chunk at offset 4 */ + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 4), + "first ooo seg position %u", ooo_seg->start); + TCP_TEST ((ooo_seg->length == 2996), + "first ooo seg length %u", ooo_seg->length); + + data64 = 0; + rv = svm_fifo_enqueue_nowait (f, 0, sizeof (u32), (u8 *) & data64); + + TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); + + svm_fifo_free (f); + return 0; } +static int +tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 4 << 10; + u32 fifo_initial_offset = 0; + u32 total_size = 2 << 10; + int overlap = 0; + int i, rv; + u8 *data_pattern = 0; + test_pattern_t *tp, *generate = 0; + u32 nsegs = 2; + u32 seg_size, length_so_far; + u32 current_offset, offset_increment, len_this_chunk; + u32 seed = 
0xdeaddabe; + int verbose = 0; + int randomize = 1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "fifo-size %d", &fifo_size)) + ; + else if (unformat (input, "total-size %d", &total_size)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "overlap")) + overlap = 1; + else if (unformat (input, "initial-offset %d", &fifo_initial_offset)) + ; + else if (unformat (input, "seed %d", &seed)) + ; + else if (unformat (input, "nsegs %d", &nsegs)) + ; + else if (unformat (input, "no-randomize")) + randomize = 0; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + /* + * Generate data + */ + vec_validate (data_pattern, total_size - 1); + for (i = 0; i < vec_len (data_pattern); i++) + data_pattern[i] = i & 0xff; + + seg_size = total_size / nsegs; + length_so_far = 0; + current_offset = 1; + while (length_so_far < total_size) + { + vec_add2 (generate, tp, 1); + len_this_chunk = clib_min (seg_size, total_size - length_so_far); + tp->offset = current_offset; + tp->len = len_this_chunk; + + if (overlap && (len_this_chunk == seg_size)) + do + { + offset_increment = len_this_chunk + % (1 + (random_u32 (&seed) % len_this_chunk)); + } + while (offset_increment == 0); + else + offset_increment = len_this_chunk; + + current_offset += offset_increment; + length_so_far = tp->offset + tp->len; + } + + /* + * Validate segment list. Only valid for non-overlap cases. 
+ */ + if (overlap == 0) + fifo_validate_pattern (vm, generate, vec_len (generate)); + + if (verbose) + { + vlib_cli_output (vm, "raw data pattern:"); + for (i = 0; i < vec_len (generate); i++) + { + vlib_cli_output (vm, "[%d] offset %u len %u", i, + generate[i].offset, generate[i].len); + } + } + + /* Randomize data pattern */ + if (randomize) + { + for (i = 0; i < vec_len (generate) / 2; i++) + { + u32 src_index, dst_index; + test_pattern_t _tmp, *tmp = &_tmp; + + src_index = random_u32 (&seed) % vec_len (generate); + dst_index = random_u32 (&seed) % vec_len (generate); + + tmp[0] = generate[dst_index]; + generate[dst_index] = generate[src_index]; + generate[src_index] = tmp[0]; + } + } + + if (verbose) + { + vlib_cli_output (vm, "randomized data pattern:"); + for (i = 0; i < vec_len (generate); i++) + { + vlib_cli_output (vm, "[%d] offset %u len %u", i, + generate[i].offset, generate[i].len); + } + } + + /* Create a fifo */ + f = svm_fifo_create (fifo_size); + + /* Paint the fifo data vector with -1's */ + memset (f->data, 0xFF, fifo_size); + + /* manually set head and tail pointers to validate modular arithmetic */ + f->head = fifo_initial_offset % fifo_size; + f->tail = fifo_initial_offset % fifo_size; + + for (i = 0; i < vec_len (generate); i++) + { + tp = generate + i; + rv = svm_fifo_enqueue_with_offset (f, 0, tp->offset, tp->len, + (u8 *) data_pattern + tp->offset); + } + + /* Expected result: one big fat chunk at offset 1 */ + + if (verbose) + vlib_cli_output (vm, "fifo before missing link: %U", + format_svm_fifo, f, 1 /* verbose */ ); + + rv = svm_fifo_enqueue_nowait (f, 0, 1 /* count */ , data_pattern + 0); + + if (verbose) + vlib_cli_output (vm, "in-order enqueue returned %d", rv); + + TCP_TEST ((rv == total_size), "retrieved %u expected %u", rv, total_size); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + svm_fifo_free (f); + vec_free (data_pattern); + + return 0; +} + +static 
int +tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) +{ + int res = 0; + + /* Run all tests */ + if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT) + { + res = tcp_test_fifo1 (vm, input); + if (res) + return res; + + res = tcp_test_fifo2 (vm); + if (res) + return res; + + /* Run a number of fifo3 configs */ + unformat_init_cstring (input, "nsegs 3 overlap seed 123"); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + + unformat_init_cstring (input, "nsegs 10"); + if (tcp_test_fifo3 (vm, input)) + return -1; + unformat_free (input); + } + else + { + if (unformat (input, "fifo3")) + { + res = tcp_test_fifo3 (vm, input); + } + else if (unformat (input, "fifo2")) + { + res = tcp_test_fifo2 (vm); + } + else if (unformat (input, "fifo1")) + { + res = tcp_test_fifo1 (vm, input); + } + } + + return res; +} static clib_error_t * tcp_test (vlib_main_t * vm, -- cgit 1.2.3-korg From a5464817522c7a7dc760af4612f1d6a68ed0afc8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 19 Apr 2017 13:00:05 -0700 Subject: Session layer improvements Among others: - Moved app event queue to shared memory segment - Use private memory segment for builtin apps - Remove pid from svm fifo - Protect session fifo (de)allocation - Use fifo event for session disconnects - Have session queue node poll in all wk threads Change-Id: I89dbf7fdfebef12f5ef2b34ba3ef3c2c07f49ff2 Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 30 ++--- src/svm/svm_fifo.h | 31 ++--- src/svm/svm_fifo_segment.c | 50 +++++++- src/svm/svm_fifo_segment.h | 5 + src/svm/test_svm_fifo1.c | 27 ++--- src/uri/uri_tcp_test.c | 189 ++++++++++++++++++++----------- src/uri/uri_udp_test.c | 40 ++++--- src/vnet/session/application.c | 48 +++----- src/vnet/session/application.h | 12 -- src/vnet/session/application_interface.c | 26 +---- src/vnet/session/application_interface.h | 38 ++++++- src/vnet/session/node.c | 63 ++++++----- src/vnet/session/segment_manager.c | 134 ++++++++++++++++++---- 
src/vnet/session/segment_manager.h | 12 ++ src/vnet/session/session.c | 138 ++++++++++++---------- src/vnet/session/session.h | 19 ++-- src/vnet/session/session_api.c | 58 ++++------ src/vnet/tcp/builtin_client.c | 9 +- src/vnet/tcp/builtin_server.c | 8 +- src/vnet/tcp/tcp.c | 13 ++- src/vnet/tcp/tcp_input.c | 8 +- src/vnet/tcp/tcp_output.c | 6 - src/vnet/tcp/tcp_test.c | 43 ++++--- src/vnet/udp/builtin_server.c | 8 +- src/vnet/udp/udp_input.c | 5 +- 25 files changed, 604 insertions(+), 416 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index f428d3ec..8f2ed0c9 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -57,7 +57,7 @@ format_svm_fifo (u8 * s, va_list * args) if (verbose > 1) s = format (s, "server session %d thread %d client session %d thread %d\n", - f->server_session_index, f->server_thread_index, + f->master_session_index, f->master_thread_index, f->client_session_index, f->client_thread_index); if (verbose) @@ -353,8 +353,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } static int -svm_fifo_enqueue_internal (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_from_here) +svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -411,10 +410,9 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, } int -svm_fifo_enqueue_nowait (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_from_here) +svm_fifo_enqueue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) { - return svm_fifo_enqueue_internal (f, pid, max_bytes, copy_from_here); + return svm_fifo_enqueue_internal (f, max_bytes, copy_from_here); } /** @@ -426,7 +424,6 @@ svm_fifo_enqueue_nowait (svm_fifo_t * f, */ static int svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, - int pid, u32 offset, u32 required_bytes, u8 * copy_from_here) @@ -439,7 +436,7 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * 
f, /* Users would do well to avoid this */ if (PREDICT_FALSE (f->tail == (offset % f->nitems))) { - rv = svm_fifo_enqueue_internal (f, pid, required_bytes, copy_from_here); + rv = svm_fifo_enqueue_internal (f, required_bytes, copy_from_here); if (rv > 0) return 0; return -1; @@ -484,18 +481,16 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, int svm_fifo_enqueue_with_offset (svm_fifo_t * f, - int pid, u32 offset, u32 required_bytes, u8 * copy_from_here) { - return svm_fifo_enqueue_with_offset_internal - (f, pid, offset, required_bytes, copy_from_here); + return svm_fifo_enqueue_with_offset_internal (f, offset, required_bytes, + copy_from_here); } static int -svm_fifo_dequeue_internal (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_here) +svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; @@ -545,14 +540,13 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, } int -svm_fifo_dequeue_nowait (svm_fifo_t * f, - int pid, u32 max_bytes, u8 * copy_here) +svm_fifo_dequeue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) { - return svm_fifo_dequeue_internal (f, pid, max_bytes, copy_here); + return svm_fifo_dequeue_internal (f, max_bytes, copy_here); } int -svm_fifo_peek (svm_fifo_t * f, int pid, u32 relative_offset, u32 max_bytes, +svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, u8 * copy_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; @@ -590,7 +584,7 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 relative_offset, u32 max_bytes, } int -svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) +svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) { u32 total_drop_bytes, first_drop_bytes, second_drop_bytes; u32 cursize, nitems; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 0fff2577..d67237c6 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -23,13 +23,6 @@ #include #include -typedef enum -{ - 
SVM_FIFO_TAG_NOT_HELD = 0, - SVM_FIFO_TAG_DEQUEUE, - SVM_FIFO_TAG_ENQUEUE, -} svm_lock_tag_t; - /** Out-of-order segment */ typedef struct { @@ -37,7 +30,7 @@ typedef struct u32 prev; /**< Previous linked-list element pool index */ u32 start; /**< Start of segment, normalized*/ - u32 length; /**< Length of segment */ + u32 length; /**< Length of segment */ } ooo_segment_t; format_function_t format_ooo_segment; @@ -52,12 +45,11 @@ typedef struct CLIB_CACHE_LINE_ALIGN_MARK (end_cursize); volatile u8 has_event; /**< non-zero if deq event exists */ - u32 owner_pid; /* Backpointers */ - u32 server_session_index; + u32 master_session_index; u32 client_session_index; - u8 server_thread_index; + u8 master_thread_index; u8 client_thread_index; u32 segment_manager; CLIB_CACHE_LINE_ALIGN_MARK (end_shared); @@ -117,19 +109,14 @@ svm_fifo_unset_event (svm_fifo_t * f) svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); void svm_fifo_free (svm_fifo_t * f); -int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, +int svm_fifo_enqueue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here); +int svm_fifo_enqueue_with_offset (svm_fifo_t * f, u32 offset, + u32 required_bytes, u8 * copy_from_here); +int svm_fifo_dequeue_nowait (svm_fifo_t * f, u32 max_bytes, u8 * copy_here); -int svm_fifo_enqueue_with_offset (svm_fifo_t * f, int pid, - u32 offset, u32 required_bytes, - u8 * copy_from_here); - -int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, - u8 * copy_here); - -int svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, - u8 * copy_here); -int svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes); +int svm_fifo_peek (svm_fifo_t * f, u32 offset, u32 max_bytes, u8 * copy_here); +int svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes); u32 svm_fifo_number_ooo_segments (svm_fifo_t * f); ooo_segment_t *svm_fifo_first_ooo_segment (svm_fifo_t * f); diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index 
acabb3bd..281fae27 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -70,6 +70,44 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) return (0); } +/** Create an svm fifo segment in process-private memory */ +int +svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) +{ + svm_fifo_segment_private_t *s; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + + /* Allocate a fresh segment */ + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = ~0; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = ~0; + + /* Allocate a [sic] shared memory header, in process memory... */ + sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); + s->ssvm.sh = sh; + + memset (sh, 0, sizeof (*sh)); + sh->heap = clib_mem_get_heap (); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + sh->ready = 1; + a->new_segment_index = s - sm->segments; + return (0); +} + /** (slave) attach to an svm fifo segment */ int svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) @@ -82,7 +120,6 @@ svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) /* Allocate a fresh segment */ pool_get (sm->segments, s); - memset (s, 0, sizeof (*s)); s->ssvm.ssvm_size = a->segment_size; @@ -126,19 +163,22 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + ssvm_lock (sh, 1, 0); oldheap = ssvm_push_heap (sh); /* Note: this can fail, in which case: create another segment */ f = svm_fifo_create (data_size_in_bytes); - if (f == 0) + if (PREDICT_FALSE (f == 0)) { ssvm_pop_heap (oldheap); + ssvm_unlock (sh); return (0); } 
vec_add1 (fsh->fifos, f); - ssvm_pop_heap (oldheap); + ssvm_unlock (sh); return (f); } @@ -152,8 +192,9 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f) sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - oldheap = ssvm_push_heap (sh); + ssvm_lock (sh, 1, 0); + oldheap = ssvm_push_heap (sh); for (i = 0; i < vec_len (fsh->fifos); i++) { if (fsh->fifos[i] == f) @@ -167,6 +208,7 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f) found: clib_mem_free (f); ssvm_pop_heap (oldheap); + ssvm_unlock (sh); } void diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 9ab47a4c..4218013a 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -17,6 +17,7 @@ #include #include +#include typedef struct { @@ -32,6 +33,8 @@ typedef struct typedef struct { + volatile u32 lock; + /** pool of segments */ svm_fifo_segment_private_t *segments; /* Where to put the next one */ @@ -78,6 +81,8 @@ typedef enum } ssvm_fifo_segment_api_error_enum_t; int svm_fifo_segment_create (svm_fifo_segment_create_args_t * a); +int svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t + * a); int svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a); void svm_fifo_segment_delete (svm_fifo_segment_private_t * s); diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c index 355653df..398dd6d7 100644 --- a/src/svm/test_svm_fifo1.c +++ b/src/svm/test_svm_fifo1.c @@ -25,7 +25,6 @@ hello_world (int verbose) u8 *test_data; u8 *retrieved_data = 0; clib_error_t *error = 0; - int pid = getpid (); memset (a, 0, sizeof (*a)); @@ -48,18 +47,16 @@ hello_world (int verbose) vec_validate (retrieved_data, vec_len (test_data) - 1); while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) - svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + svm_fifo_enqueue_nowait (f, vec_len (test_data), test_data); while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) - 
svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), - retrieved_data); + svm_fifo_dequeue_nowait (f, vec_len (retrieved_data), retrieved_data); while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) - svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + svm_fifo_enqueue_nowait (f, vec_len (test_data), test_data); while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) - svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), - retrieved_data); + svm_fifo_dequeue_nowait (f, vec_len (retrieved_data), retrieved_data); if (!memcmp (retrieved_data, test_data, vec_len (test_data))) error = clib_error_return (0, "data test OK, got '%s'", retrieved_data); @@ -81,7 +78,6 @@ master (int verbose) u8 *test_data; u8 *retrieved_data = 0; int i; - int pid = getpid (); memset (a, 0, sizeof (*a)); @@ -104,7 +100,7 @@ master (int verbose) vec_validate (retrieved_data, vec_len (test_data) - 1); for (i = 0; i < 1000; i++) - svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + svm_fifo_enqueue_nowait (f, vec_len (test_data), test_data); return clib_error_return (0, "master (enqueue) done"); } @@ -176,7 +172,6 @@ offset (int verbose) u32 *test_data = 0; u32 *recovered_data = 0; int i; - int pid = getpid (); memset (a, 0, sizeof (*a)); @@ -199,19 +194,19 @@ offset (int verbose) vec_add1 (test_data, i); /* Enqueue the first 1024 u32's */ - svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + svm_fifo_enqueue_nowait (f, 4096 /* bytes to enqueue */ , (u8 *) test_data); /* Enqueue the third 1024 u32's 2048 ahead of the current tail */ - svm_fifo_enqueue_with_offset (f, pid, 4096, 4096, (u8 *) & test_data[2048]); + svm_fifo_enqueue_with_offset (f, 4096, 4096, (u8 *) & test_data[2048]); /* Enqueue the second 1024 u32's at the current tail */ - svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + svm_fifo_enqueue_nowait (f, 4096 /* bytes to enqueue */ , (u8 *) & test_data[1024]); vec_validate (recovered_data, (3 * 1024) - 1); - 
svm_fifo_dequeue_nowait (f, pid, 3 * 4096, (u8 *) recovered_data); + svm_fifo_dequeue_nowait (f, 3 * 4096, (u8 *) recovered_data); for (i = 0; i < (3 * 1024); i++) { @@ -237,7 +232,6 @@ slave (int verbose) int rv; u8 *test_data; u8 *retrieved_data = 0; - int pid = getpid (); int i; memset (a, 0, sizeof (*a)); @@ -262,8 +256,7 @@ slave (int verbose) for (i = 0; i < 1000; i++) { - svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), - retrieved_data); + svm_fifo_dequeue_nowait (f, vec_len (retrieved_data), retrieved_data); if (memcmp (retrieved_data, test_data, vec_len (retrieved_data))) return clib_error_return (0, "retrieved data incorrect, '%s'", retrieved_data); diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 2e15d36c..686c93f9 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -45,12 +45,13 @@ typedef struct svm_fifo_t *server_rx_fifo; svm_fifo_t *server_tx_fifo; - u32 vpp_session_handle; + u64 vpp_session_handle; } session_t; typedef enum { STATE_START, + STATE_ATTACHED, STATE_READY, STATE_DISCONNECTING, STATE_FAILED @@ -127,6 +128,34 @@ uri_tcp_test_main_t uri_tcp_test_main; #define NITER 4000000 #endif +static u8 * +format_api_error (u8 * s, va_list * args) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +static void +init_error_string_table (uri_tcp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + int wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) { @@ -150,7 +179,7 @@ wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) } void 
-application_attach (uri_tcp_test_main_t * utm) +application_send_attach (uri_tcp_test_main_t * utm) { vl_api_application_attach_t *bmp; u32 fifo_size = 3 << 20; @@ -160,8 +189,8 @@ application_attach (uri_tcp_test_main_t * utm) bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_ATTACH); bmp->client_index = utm->my_client_index; bmp->context = ntohl (0xfeedface); - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[APP_OPTIONS_FLAGS] = + APP_OPTIONS_FLAGS_USE_FIFO | APP_OPTIONS_FLAGS_ADD_SEGMENT; bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; @@ -169,6 +198,18 @@ application_attach (uri_tcp_test_main_t * utm) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); } +int +application_attach (uri_tcp_test_main_t * utm) +{ + application_send_attach (utm); + if (wait_for_state_change (utm, STATE_ATTACHED)) + { + clib_warning ("timeout waiting for STATE_ATTACHED"); + return -1; + } + return 0; +} + void application_detach (uri_tcp_test_main_t * utm) { @@ -192,8 +233,8 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * if (mp->retval) { - uword *errp = hash_get (utm->error_string_by_error_number, -mp->retval); - clib_warning ("attach failed: %s", *errp); + clib_warning ("attach failed: %U", format_api_error, + clib_net_to_host_u32 (mp->retval)); utm->state = STATE_FAILED; return; } @@ -220,7 +261,7 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * utm->our_event_queue = (unix_shared_memory_queue_t *) mp->app_event_queue_address; - + utm->state = STATE_ATTACHED; } static void @@ -231,18 +272,6 @@ vl_api_application_detach_reply_t_handler (vl_api_application_detach_reply_t * clib_warning ("detach returned with err: %d", mp->retval); } -static void -init_error_string_table (uri_tcp_test_main_t * utm) -{ - 
utm->error_string_by_error_number = hash_create (0, sizeof (uword)); - -#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); - foreach_vnet_api_error; -#undef _ - - hash_set (utm->error_string_by_error_number, 99, "Misc"); -} - static void stop_signal (int signum) { @@ -392,7 +421,7 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, /* Read the bytes */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, + n_read = svm_fifo_dequeue_nowait (rx_fifo, clib_min (vec_len (utm->rx_buf), bytes), utm->rx_buf); if (n_read > 0) @@ -432,11 +461,11 @@ client_handle_event_queue (uri_tcp_test_main_t * utm) 0 /* nowait */ ); switch (e->event_type) { - case FIFO_EVENT_SERVER_RX: + case FIFO_EVENT_APP_RX: client_handle_fifo_event_rx (utm, e); break; - case FIFO_EVENT_SERVER_EXIT: + case FIFO_EVENT_DISCONNECT: return; default: @@ -458,11 +487,11 @@ client_rx_thread_fn (void *arg) 0 /* nowait */ ); switch (e->event_type) { - case FIFO_EVENT_SERVER_RX: + case FIFO_EVENT_APP_RX: client_handle_fifo_event_rx (utm, e); break; - case FIFO_EVENT_SERVER_EXIT: + case FIFO_EVENT_DISCONNECT: return 0; default: clib_warning ("unknown event type %d", e->event_type); @@ -487,9 +516,8 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) if (mp->retval) { - uword *errp = hash_get (utm->error_string_by_error_number, - -clib_net_to_host_u32 (mp->retval)); - clib_warning ("connection failed with code: %s", *errp); + clib_warning ("connection failed with code: %U", format_api_error, + clib_net_to_host_u32 (mp->retval)); utm->state = STATE_FAILED; return; } @@ -551,7 +579,7 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, { actual_write = bytes_to_snd > queue_max_chunk ? 
queue_max_chunk : bytes_to_snd; - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, actual_write, + rv = svm_fifo_enqueue_nowait (tx_fifo, actual_write, test_data + test_buf_offset); if (rv > 0) @@ -564,7 +592,7 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, { /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; unix_shared_memory_queue_add (utm->vpp_event_queue, @@ -619,7 +647,7 @@ client_send_data (uri_tcp_test_main_t * utm) } void -client_connect (uri_tcp_test_main_t * utm) +client_send_connect (uri_tcp_test_main_t * utm) { vl_api_connect_uri_t *cmp; cmp = vl_msg_api_alloc (sizeof (*cmp)); @@ -632,8 +660,20 @@ client_connect (uri_tcp_test_main_t * utm) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); } +int +client_connect (uri_tcp_test_main_t * utm) +{ + client_send_connect (utm); + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("Connect failed"); + return -1; + } + return 0; +} + void -client_disconnect (uri_tcp_test_main_t * utm) +client_send_disconnect (uri_tcp_test_main_t * utm) { session_t *connected_session; vl_api_disconnect_session_t *dmp; @@ -647,16 +687,29 @@ client_disconnect (uri_tcp_test_main_t * utm) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & dmp); } +int +client_disconnect (uri_tcp_test_main_t * utm) +{ + client_send_disconnect (utm); + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("Disconnect failed"); + return -1; + } + return 0; +} + static void client_test (uri_tcp_test_main_t * utm) { int i; - application_attach (utm); - client_connect (utm); + if (application_attach (utm)) + return; - if (wait_for_state_change (utm, STATE_READY)) + if (client_connect (utm)) { + application_detach (utm); return; } @@ -671,11 +724,6 @@ client_test (uri_tcp_test_main_t * utm) /* Disconnect */ client_disconnect (utm); - if (wait_for_state_change (utm, 
STATE_START)) - { - clib_warning ("Disconnect failed"); - return; - } application_detach (utm); } @@ -686,9 +734,8 @@ vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) if (mp->retval) { - uword *errp = hash_get (utm->error_string_by_error_number, - -clib_net_to_host_u32 (mp->retval)); - clib_warning ("bind failed: %s", (char *) *errp); + clib_warning ("bind failed: %U", format_api_error, + clib_net_to_host_u32 (mp->retval)); utm->state = STATE_FAILED; return; } @@ -869,7 +916,7 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, /* Read the bytes */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (utm->rx_buf), + n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (utm->rx_buf), utm->rx_buf); if (n_read > 0) bytes -= n_read; @@ -882,7 +929,7 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { do { - rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); + rv = svm_fifo_enqueue_nowait (tx_fifo, n_read, utm->rx_buf); } while (rv <= 0 && !utm->time_to_stop); @@ -891,7 +938,7 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, { /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = e->event_id; q = utm->vpp_event_queue; @@ -914,11 +961,11 @@ server_handle_event_queue (uri_tcp_test_main_t * utm) 0 /* nowait */ ); switch (e->event_type) { - case FIFO_EVENT_SERVER_RX: + case FIFO_EVENT_APP_RX: server_handle_fifo_event_rx (utm, e); break; - case FIFO_EVENT_SERVER_EXIT: + case FIFO_EVENT_DISCONNECT: return; default: @@ -936,7 +983,7 @@ server_handle_event_queue (uri_tcp_test_main_t * utm) } void -server_listen (uri_tcp_test_main_t * utm) +server_send_listen (uri_tcp_test_main_t * utm) { vl_api_bind_uri_t *bmp; bmp = vl_msg_api_alloc (sizeof (*bmp)); @@ -949,8 +996,20 @@ server_listen (uri_tcp_test_main_t * utm) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); } +int +server_listen (uri_tcp_test_main_t * utm) +{
+ server_send_listen (utm); + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return -1; + } + return 0; +} + void -server_unbind (uri_tcp_test_main_t * utm) +server_send_unbind (uri_tcp_test_main_t * utm) { vl_api_unbind_uri_t *ump; @@ -963,31 +1022,33 @@ server_unbind (uri_tcp_test_main_t * utm) vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); } +int +server_unbind (uri_tcp_test_main_t * utm) +{ + server_send_unbind (utm); + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return -1; + } + return 0; +} + void server_test (uri_tcp_test_main_t * utm) { - application_attach (utm); + if (application_attach (utm)) + return; /* Bind to uri */ - server_listen (utm); - - if (wait_for_state_change (utm, STATE_READY)) - { - clib_warning ("timeout waiting for STATE_READY"); - return; - } + if (server_listen (utm)) + return; /* Enter handle event loop */ server_handle_event_queue (utm); /* Cleanup */ - server_unbind (utm); - - if (wait_for_state_change (utm, STATE_START)) - { - clib_warning ("timeout waiting for STATE_START"); - return; - } + server_send_unbind (utm); application_detach (utm); diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 598052bc..266215c8 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -164,7 +164,7 @@ setup_signal_handlers (void) } void -application_attach (uri_udp_test_main_t * utm) +application_send_attach (uri_udp_test_main_t * utm) { vl_api_application_attach_t *bmp; u32 fifo_size = 3 << 20; @@ -174,8 +174,8 @@ application_attach (uri_udp_test_main_t * utm) bmp->_vl_msg_id = ntohs (VL_API_APPLICATION_ATTACH); bmp->client_index = utm->my_client_index; bmp->context = ntohl (0xfeedface); - bmp->options[SESSION_OPTIONS_FLAGS] = - SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[APP_OPTIONS_FLAGS] = + APP_OPTIONS_FLAGS_USE_FIFO | APP_OPTIONS_FLAGS_ADD_SEGMENT; 
bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; @@ -307,7 +307,7 @@ cut_through_thread_fn (void *arg) /* We read from the tx fifo and write to the rx fifo */ do { - actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, 0, + actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, vec_len (my_copy_buffer), my_copy_buffer); } @@ -318,7 +318,7 @@ cut_through_thread_fn (void *arg) buffer_offset = 0; while (actual_transfer > 0) { - rv = svm_fifo_enqueue_nowait (rx_fifo, 0, actual_transfer, + rv = svm_fifo_enqueue_nowait (rx_fifo, actual_transfer, my_copy_buffer + buffer_offset); if (rv > 0) { @@ -357,7 +357,6 @@ client_send (uri_udp_test_main_t * utm, session_t * session) u64 bytes_received = 0, bytes_sent = 0; i32 bytes_to_read; int rv; - int mypid = getpid (); f64 before, after, delta, bytes_per_second; svm_fifo_t *rx_fifo, *tx_fifo; int buffer_offset, bytes_to_send = 0; @@ -382,8 +381,7 @@ client_send (uri_udp_test_main_t * utm, session_t * session) buffer_offset = 0; while (bytes_to_send > 0) { - rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, - bytes_to_send, + rv = svm_fifo_enqueue_nowait (tx_fifo, bytes_to_send, test_data + buffer_offset); if (rv > 0) @@ -402,7 +400,7 @@ client_send (uri_udp_test_main_t * utm, session_t * session) buffer_offset = 0; while (bytes_to_read > 0) { - rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + rv = svm_fifo_dequeue_nowait (rx_fifo, bytes_to_read, utm->rx_buf + buffer_offset); if (rv > 0) @@ -415,8 +413,8 @@ client_send (uri_udp_test_main_t * utm, session_t * session) } while (bytes_received < bytes_sent) { - rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, - vec_len (utm->rx_buf), utm->rx_buf); + rv = + svm_fifo_dequeue_nowait (rx_fifo, vec_len (utm->rx_buf), utm->rx_buf); if (rv > 0) { #if CLIB_DEBUG > 0 @@ -459,7 +457,7 @@ uri_udp_client_test (uri_udp_test_main_t * utm) { session_t *session; - application_attach (utm); 
+ application_send_attach (utm); udp_client_connect (utm); if (wait_for_state_change (utm, STATE_READY)) @@ -559,8 +557,8 @@ vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) 128 * 1024); ASSERT (session->server_tx_fifo); - session->server_rx_fifo->server_session_index = session - utm->sessions; - session->server_tx_fifo->server_session_index = session - utm->sessions; + session->server_rx_fifo->master_session_index = session - utm->sessions; + session->server_tx_fifo->master_session_index = session - utm->sessions; utm->cut_through_session_index = session - utm->sessions; rv = pthread_create (&utm->cut_through_thread_handle, @@ -805,19 +803,19 @@ server_handle_fifo_event_rx (uri_udp_test_main_t * utm, do { - nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, - vec_len (utm->rx_buf), utm->rx_buf); + nbytes = svm_fifo_dequeue_nowait (rx_fifo, vec_len (utm->rx_buf), + utm->rx_buf); } while (nbytes <= 0); do { - rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); + rv = svm_fifo_enqueue_nowait (tx_fifo, nbytes, utm->rx_buf); } while (rv == -2); /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = e->event_id; if (svm_fifo_set_event (tx_fifo)) @@ -839,11 +837,11 @@ server_handle_event_queue (uri_udp_test_main_t * utm) 0 /* nowait */ ); switch (e->event_type) { - case FIFO_EVENT_SERVER_RX: + case FIFO_EVENT_APP_RX: server_handle_fifo_event_rx (utm, e); break; - case FIFO_EVENT_SERVER_EXIT: + case FIFO_EVENT_DISCONNECT: return; default: @@ -893,7 +891,7 @@ void udp_server_test (uri_udp_test_main_t * utm) { - application_attach (utm); + application_send_attach (utm); /* Bind to uri */ server_listen (utm); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index 5a45537b..ccf9837f 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -87,14 +87,17 @@ application_new () void application_del (application_t * 
app) { - api_main_t *am = &api_main; - void *oldheap; segment_manager_t *sm; u64 handle; u32 index, *handles = 0; int i; vnet_unbind_args_t _a, *a = &_a; + /* + * The app event queue allocated in first segment is cleared with + * the segment manager. No need to explicitly free it. + */ + /* * Cleanup segment managers */ @@ -120,14 +123,6 @@ application_del (application_t * app) vnet_unbind (a); } - /* - * Free the event fifo in the /vpe-api shared-memory segment - */ - oldheap = svm_push_data_heap (am->vlib_rp); - if (app->event_queue) - unix_shared_memory_queue_free (app->event_queue); - svm_pop_heap (oldheap); - application_table_del (app); pool_put (app_pool, app); } @@ -149,30 +144,14 @@ int application_init (application_t * app, u32 api_client_index, u64 * options, session_cb_vft_t * cb_fns) { - api_main_t *am = &api_main; segment_manager_t *sm; segment_manager_properties_t *props; - void *oldheap; - u32 app_evt_queue_size; + u32 app_evt_queue_size, first_seg_size; int rv; app_evt_queue_size = options[APP_EVT_QUEUE_SIZE] > 0 ? 
options[APP_EVT_QUEUE_SIZE] : default_app_evt_queue_size; - /* Allocate event fifo in the /vpe-api shared-memory segment */ - oldheap = svm_push_data_heap (am->vlib_rp); - - /* Allocate server event queue */ - app->event_queue = - unix_shared_memory_queue_init (app_evt_queue_size, - sizeof (session_fifo_event_t), - 0 /* consumer pid */ , - 0 - /* (do not) signal when queue non-empty */ - ); - - svm_pop_heap (oldheap); - /* Setup segment manager */ sm = segment_manager_new (); sm->app_index = app->index; @@ -181,16 +160,21 @@ application_init (application_t * app, u32 api_client_index, u64 * options, props->rx_fifo_size = options[SESSION_OPTIONS_RX_FIFO_SIZE]; props->tx_fifo_size = options[SESSION_OPTIONS_TX_FIFO_SIZE]; props->add_segment = props->add_segment_size != 0; + props->use_private_segment = options[APP_OPTIONS_FLAGS] + & APP_OPTIONS_FLAGS_BUILTIN_APP; - if ((rv = segment_manager_init (sm, props, - options[SESSION_OPTIONS_SEGMENT_SIZE]))) + first_seg_size = options[SESSION_OPTIONS_SEGMENT_SIZE]; + if ((rv = segment_manager_init (sm, props, first_seg_size))) return rv; app->first_segment_manager = segment_manager_index (sm); app->api_client_index = api_client_index; - app->flags = options[SESSION_OPTIONS_FLAGS]; + app->flags = options[APP_OPTIONS_FLAGS]; app->cb_fns = *cb_fns; + /* Allocate app event queue in the first shared-memory segment */ + app->event_queue = segment_manager_alloc_queue (sm, app_evt_queue_size); + /* Check that the obvious things are properly set up */ application_verify_cb_fns (cb_fns); @@ -451,8 +435,8 @@ application_format_connects (application_t * app, int verbose) continue; fifo = fifos[i]; - session_index = fifo->server_session_index; - thread_index = fifo->server_thread_index; + session_index = fifo->master_session_index; + thread_index = fifo->master_thread_index; session = stream_session_get (session_index, thread_index); str = format (0, "%U", format_stream_session, session, verbose); diff --git 
a/src/vnet/session/application.h b/src/vnet/session/application.h index 6bcee9d3..35caae85 100644 --- a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -61,18 +61,6 @@ typedef struct _application /** Flags */ u32 flags; - /* Stream server mode: accept or connect - * TODO REMOVE*/ - u8 mode; - - /** Index of the listen session or connect session - * TODO REMOVE*/ - u32 session_index; - - /** Session thread index for client connect sessions - * TODO REMOVE */ - u32 thread_index; - /* * Binary API interface to external app */ diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 96d2c621..ad44baa1 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -142,7 +142,7 @@ vnet_connect_i (u32 app_index, u32 api_context, session_type_t sst, * Server is willing to have a direct fifo connection created * instead of going through the state machine, etc. */ - if (server->flags & SESSION_OPTIONS_FLAGS_USE_FIFO) + if (server->flags & APP_OPTIONS_FLAGS_USE_FIFO) return server->cb_fns. redirect_connect_callback (server->api_client_index, mp); } @@ -363,7 +363,11 @@ vnet_disconnect_session (vnet_disconnect_args_t * a) if (!s || s->app_index != a->app_index) return VNET_API_ERROR_INVALID_VALUE; - stream_session_disconnect (s); + /* We're peeking into another's thread pool. 
Make sure */ + ASSERT (s->session_index == index); + + session_send_session_evt_to_thread (a->handle, FIFO_EVENT_DISCONNECT, + thread_index); return 0; } @@ -395,24 +399,6 @@ vnet_connect (vnet_connect_args_t * a) return vnet_connect_i (a->app_index, a->api_context, sst, &a->tep, a->mp); } -int -vnet_disconnect (vnet_disconnect_args_t * a) -{ - stream_session_t *session; - u32 session_index, thread_index; - - if (api_parse_session_handle (a->handle, &session_index, &thread_index)) - { - clib_warning ("Invalid handle"); - return -1; - } - - session = stream_session_get (session_index, thread_index); - stream_session_disconnect (session); - - return 0; -} - /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index 2c497531..7d924c14 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -30,10 +30,18 @@ typedef enum _session_api_proto typedef struct _vnet_app_attach_args_t { + /** Binary API client index */ u32 api_client_index; + + /** Application and segment manager options */ u64 *options; + + /** Session to application callback functions */ session_cb_vft_t *session_cb_vft; + /** Flag that indicates if app is builtin */ + u8 builtin; + /* * Results */ @@ -110,7 +118,7 @@ typedef struct _vnet_disconnect_args_t typedef enum { APP_EVT_QUEUE_SIZE, - SESSION_OPTIONS_FLAGS, + APP_OPTIONS_FLAGS, SESSION_OPTIONS_SEGMENT_SIZE, SESSION_OPTIONS_ADD_SEGMENT_SIZE, SESSION_OPTIONS_RX_FIFO_SIZE, @@ -119,11 +127,30 @@ typedef enum SESSION_OPTIONS_N_OPTIONS } app_attach_options_index_t; -/** Server can handle delegated connect requests from local clients */ -#define SESSION_OPTIONS_FLAGS_USE_FIFO (1<<0) +#define foreach_app_options_flags \ + _(USE_FIFO, "Use FIFO with redirects") \ + _(ADD_SEGMENT, "Add segment and signal app if needed") \ + _(BUILTIN_APP, "Application is builtin") \ + +typedef enum _app_options +{ +#define _(sym, str) 
APP_OPTIONS_##sym, + foreach_app_options_flags +#undef _ +} app_options_t; + +typedef enum _app_options_flags +{ +#define _(sym, str) APP_OPTIONS_FLAGS_##sym = 1 << APP_OPTIONS_##sym, + foreach_app_options_flags +#undef _ +} app_options_flags_t; -/** Server wants vpp to add segments when out of memory for fifos */ -#define SESSION_OPTIONS_FLAGS_ADD_SEGMENT (1<<1) +///** Server can handle delegated connect requests from local clients */ +//#define APP_OPTIONS_FLAGS_USE_FIFO (1<<0) +// +///** Server wants vpp to add segments when out of memory for fifos */ +//#define APP_OPTIONS_FLAGS_ADD_SEGMENT (1<<1) #define VNET_CONNECT_REDIRECTED 123 @@ -138,7 +165,6 @@ int vnet_disconnect_session (vnet_disconnect_args_t * a); int vnet_bind (vnet_bind_args_t * a); int vnet_connect (vnet_connect_args_t * a); int vnet_unbind (vnet_unbind_args_t * a); -int vnet_disconnect (vnet_disconnect_args_t * a); int api_parse_session_handle (u64 handle, u32 * session_index, diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index dd211c51..210754fa 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -218,8 +218,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * 2) buffer chains */ if (peek_data) { - n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, s0->pid, - rx_offset, len_to_deq0, data0); + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, + len_to_deq0, data0); if (n_bytes_read <= 0) goto dequeue_fail; @@ -230,8 +230,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, else { n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, - s0->pid, len_to_deq0, - data0); + len_to_deq0, data0); if (n_bytes_read <= 0) goto dequeue_fail; } @@ -301,6 +300,26 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, n_tx_pkts, 0); } +stream_session_t * +session_event_get_session (session_fifo_event_t * e0, u8 thread_index) +{ + svm_fifo_t *f0; + stream_session_t *s0; + 
u32 session_index0; + + f0 = e0->fifo; + session_index0 = f0->master_session_index; + + /* $$$ add multiple event queues, per vpp worker thread */ + ASSERT (f0->master_thread_index == thread_index); + + s0 = stream_session_get_if_valid (session_index0, thread_index); + + ASSERT (s0->thread_index == thread_index); + + return s0; +} + static uword session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -370,34 +389,24 @@ skip_dequeue: n_events = vec_len (my_fifo_events); for (i = 0; i < n_events; i++) { - svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ - stream_session_t *s0; - u32 session_index0; + stream_session_t *s0; /* $$$ prefetch 1 ahead maybe */ session_fifo_event_t *e0; e0 = &my_fifo_events[i]; - f0 = e0->fifo; - session_index0 = f0->server_session_index; - - /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (f0->server_thread_index == my_thread_index); - s0 = stream_session_get_if_valid (session_index0, my_thread_index); - - if (CLIB_DEBUG && !s0) + switch (e0->event_type) { - clib_warning ("It's dead, Jim!"); - continue; - } - - if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) - continue; + case FIFO_EVENT_APP_TX: + s0 = session_event_get_session (e0, my_thread_index); - ASSERT (s0->thread_index == my_thread_index); + if (CLIB_DEBUG && !s0) + { + clib_warning ("It's dead, Jim!"); + continue; + } - switch (e0->event_type) - { - case FIFO_EVENT_SERVER_TX: + if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + continue; /* Spray packets in per session type frames, since they go to * different nodes */ rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, @@ -408,10 +417,12 @@ skip_dequeue: goto done; break; - case FIFO_EVENT_SERVER_EXIT: + case FIFO_EVENT_DISCONNECT: + s0 = stream_session_get_from_handle (e0->session_handle); stream_session_disconnect (s0); break; case FIFO_EVENT_BUILTIN_RX: + s0 = session_event_get_session (e0, my_thread_index); 
svm_fifo_unset_event (s0->server_rx_fifo); /* Get session's server */ app = application_get (s0->app_index); diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index 16e5bc56..e0532320 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -27,6 +27,11 @@ u32 segment_name_counter = 0; */ segment_manager_t *segment_managers = 0; +/** + * Process private segment index + */ +u32 private_segment_index = ~0; + /** * Default fifo and segment size. TODO config. */ @@ -100,6 +105,26 @@ session_manager_add_first_segment (segment_manager_t * sm, u32 segment_size) return rv; } +static void +segment_manager_alloc_process_private_segment () +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + + if (private_segment_index != ~0) + return; + + memset (a, 0, sizeof (*a)); + a->segment_name = "process-private-segment"; + a->segment_size = ~0; + a->new_segment_index = ~0; + + if (svm_fifo_segment_create_process_private (a)) + clib_warning ("Failed to create process private segment"); + + private_segment_index = a->new_segment_index; + ASSERT (private_segment_index != ~0); +} + /** * Initializes segment manager based on options provided. * Returns error if svm segment allocation fails. @@ -114,7 +139,9 @@ segment_manager_init (segment_manager_t * sm, /* app allocates these */ sm->properties = properties; - if (first_seg_size > 0) + first_seg_size = first_seg_size > 0 ? 
first_seg_size : default_segment_size; + + if (sm->properties->use_private_segment == 0) { rv = session_manager_add_first_segment (sm, first_seg_size); if (rv) @@ -123,7 +150,15 @@ segment_manager_init (segment_manager_t * sm, return rv; } } + else + { + if (private_segment_index == ~0) + segment_manager_alloc_process_private_segment (); + ASSERT (private_segment_index != ~0); + vec_add1 (sm->segment_indices, private_segment_index); + } + clib_spinlock_init (&sm->lockp); return 0; } @@ -162,8 +197,8 @@ segment_manager_del (segment_manager_t * sm) stream_session_t *session; fifo = fifos[i]; - session_index = fifo->server_session_index; - thread_index = fifo->server_thread_index; + session_index = fifo->master_session_index; + thread_index = fifo->master_thread_index; session = stream_session_get (session_index, thread_index); @@ -183,7 +218,9 @@ segment_manager_del (segment_manager_t * sm) deleted_thread_indices[i]); /* Instead of directly removing the session call disconnect */ - stream_session_disconnect (session); + session_send_session_evt_to_thread (stream_session_handle (session), + FIFO_EVENT_DISCONNECT, + deleted_thread_indices[i]); /* stream_session_table_del (smm, session); @@ -200,6 +237,7 @@ segment_manager_del (segment_manager_t * sm) /* svm_fifo_segment_delete (fifo_segment); */ } + clib_spinlock_free (&sm->lockp); vec_free (deleted_sessions); vec_free (deleted_thread_indices); pool_put (segment_managers, sm); @@ -232,9 +270,13 @@ segment_manager_alloc_session_fifos (segment_manager_t * sm, u8 added_a_segment = 0; int i; - /* Allocate svm fifos */ ASSERT (vec_len (sm->segment_indices)); + /* Make sure we don't have multiple threads trying to allocate segments + * at the same time. 
*/ + clib_spinlock_lock (&sm->lockp); + + /* Allocate svm fifos */ again: for (i = 0; i < vec_len (sm->segment_indices); i++) { @@ -283,7 +325,9 @@ again: } if (session_manager_add_segment (sm)) - return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + { + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + } added_a_segment = 1; goto again; @@ -295,14 +339,16 @@ again: } } - if (added_a_segment) - return segment_manager_notify_app_seg_add (sm, *fifo_segment_index); - /* Backpointers to segment manager */ sm_index = segment_manager_index (sm); (*server_tx_fifo)->segment_manager = sm_index; (*server_rx_fifo)->segment_manager = sm_index; + clib_spinlock_unlock (&sm->lockp); + + if (added_a_segment) + return segment_manager_notify_app_seg_add (sm, *fifo_segment_index); + return 0; } @@ -313,26 +359,72 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, segment_manager_t *sm; svm_fifo_segment_private_t *fifo_segment; + sm = segment_manager_get_if_valid (rx_fifo->segment_manager); + + /* It's possible to have no segment manager if the session was removed + * as result of a detach */ + if (!sm) + return; + fifo_segment = svm_fifo_get_segment (svm_segment_index); svm_fifo_segment_free_fifo (fifo_segment, rx_fifo); svm_fifo_segment_free_fifo (fifo_segment, tx_fifo); - /* If we have segment manager, try doing some cleanup. 
- * It's possible to have no segment manager if the session was removed - * as result of a detach */ - sm = segment_manager_get_if_valid (rx_fifo->segment_manager); - if (sm) + /* Remove segment only if it holds no fifos and not the first */ + if (sm->segment_indices[0] != svm_segment_index + && !svm_fifo_segment_has_fifos (fifo_segment)) { - /* Remove segment only if it holds no fifos and not the first */ - if (sm->segment_indices[0] != svm_segment_index - && !svm_fifo_segment_has_fifos (fifo_segment)) - { - svm_fifo_segment_delete (fifo_segment); - vec_del1 (sm->segment_indices, svm_segment_index); - } + svm_fifo_segment_delete (fifo_segment); + vec_del1 (sm->segment_indices, svm_segment_index); } } +/** + * Allocates shm queue in the first segment + */ +unix_shared_memory_queue_t * +segment_manager_alloc_queue (segment_manager_t * sm, u32 queue_size) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *segment; + unix_shared_memory_queue_t *q; + void *oldheap; + + ASSERT (sm->segment_indices != 0); + + segment = svm_fifo_get_segment (sm->segment_indices[0]); + sh = segment->ssvm.sh; + + oldheap = ssvm_push_heap (sh); + q = + unix_shared_memory_queue_init (queue_size, sizeof (session_fifo_event_t), + 0 /* consumer pid */ , 0 + /* signal when queue non-empty */ ); + ssvm_pop_heap (oldheap); + return q; +} + +/** + * Frees shm queue allocated in the first segment + */ +void +segment_manager_dealloc_queue (segment_manager_t * sm, + unix_shared_memory_queue_t * q) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *segment; + void *oldheap; + + ASSERT (sm->segment_indices != 0); + + segment = svm_fifo_get_segment (sm->segment_indices[0]); + sh = segment->ssvm.sh; + + oldheap = ssvm_push_heap (sh); + unix_shared_memory_queue_free (q); + ssvm_pop_heap (oldheap); +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h index 778d6040..2710bb54 100644 --- 
a/src/vnet/session/segment_manager.h +++ b/src/vnet/session/segment_manager.h @@ -18,6 +18,10 @@ #include #include +#include +#include +#include + typedef struct _segment_manager_properties { /** Session fifo sizes. */ @@ -30,10 +34,14 @@ typedef struct _segment_manager_properties /** Flag that indicates if additional segments should be created */ u8 add_segment; + /** Use private memory segment instead of shared memory */ + u8 use_private_segment; } segment_manager_properties_t; typedef struct _segment_manager { + clib_spinlock_t lockp; + /** segments mapped by this manager */ u32 *segment_indices; @@ -95,6 +103,10 @@ segment_manager_alloc_session_fifos (segment_manager_t * sm, void segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, svm_fifo_t * tx_fifo); +unix_shared_memory_queue_t *segment_manager_alloc_queue (segment_manager_t * + sm, u32 queue_size); +void segment_manager_dealloc_queue (segment_manager_t * sm, + unix_shared_memory_queue_t * q); #endif /* SRC_VNET_SESSION_SEGMENT_MANAGER_H_ */ /* diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index e6cfe7da..d17c93f8 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -377,33 +377,6 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, return 0; } -/** - * Allocate vpp event queue (once) per worker thread - */ -void -session_vpp_event_queue_allocate (session_manager_main_t * smm, - u32 thread_index) -{ - api_main_t *am = &api_main; - void *oldheap; - - if (smm->vpp_event_queues[thread_index] == 0) - { - /* Allocate event fifo in the /vpe-api shared-memory segment */ - oldheap = svm_push_data_heap (am->vlib_rp); - - smm->vpp_event_queues[thread_index] = - unix_shared_memory_queue_init (2048 /* nels $$$$ config */ , - sizeof (session_fifo_event_t), - 0 /* consumer pid */ , - 0 - /* (do not) send signal when queue non-empty */ - ); - - svm_pop_heap (oldheap); - } -} - int stream_session_create_i (segment_manager_t * sm, 
transport_connection_t * tc, stream_session_t ** ret_s) @@ -428,11 +401,11 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, /* Initialize backpointers */ pool_index = s - smm->sessions[thread_index]; - server_rx_fifo->server_session_index = pool_index; - server_rx_fifo->server_thread_index = thread_index; + server_rx_fifo->master_session_index = pool_index; + server_rx_fifo->master_thread_index = thread_index; - server_tx_fifo->server_session_index = pool_index; - server_tx_fifo->server_thread_index = thread_index; + server_tx_fifo->master_session_index = pool_index; + server_tx_fifo->master_thread_index = thread_index; s->server_rx_fifo = server_rx_fifo; s->server_tx_fifo = server_tx_fifo; @@ -485,7 +458,7 @@ stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, if (PREDICT_FALSE (len > svm_fifo_max_enqueue (s->server_rx_fifo))) return -1; - enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, s->pid, len, data); + enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, len, data); if (queue_event) { @@ -527,14 +500,14 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); - return svm_fifo_peek (s->server_tx_fifo, s->pid, offset, max_bytes, buffer); + return svm_fifo_peek (s->server_tx_fifo, offset, max_bytes, buffer); } u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); - return svm_fifo_dequeue_drop (s->server_tx_fifo, s->pid, max_bytes); + return svm_fifo_dequeue_drop (s->server_tx_fifo, max_bytes); } /** @@ -568,7 +541,7 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) { /* Fabricate event */ evt.fifo = s->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_type = FIFO_EVENT_APP_RX; evt.event_id = serial_number++; /* Add event to server's event queue 
*/ @@ -899,37 +872,45 @@ stream_session_stop_listen (stream_session_t * s) return 0; } +void +session_send_session_evt_to_thread (u64 session_handle, + fifo_event_type_t evt_type, + u32 thread_index) +{ + static u16 serial_number = 0; + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + + /* Fabricate event */ + evt.session_handle = session_handle; + evt.event_type = evt_type; + evt.event_id = serial_number++; + + q = session_manager_get_vpp_event_queue (thread_index); + + /* Based on request block (or not) for lack of space */ + if (PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + { + clib_warning ("queue full"); + return; + } +} + /** * Disconnect session and propagate to transport. This should eventually * result in a delete notification that allows us to cleanup session state. * Called for both active/passive disconnects. + * + * Should be called from the session's thread. */ void stream_session_disconnect (stream_session_t * s) { -// session_fifo_event_t evt; - s->session_state = SESSION_STATE_CLOSED; - /* RPC to vpp evt queue in the right thread */ - tp_vfts[s->session_type].close (s->connection_index, s->thread_index); - -// { -// /* Fabricate event */ -// evt.fifo = s->server_rx_fifo; -// evt.event_type = FIFO_EVENT_SERVER_RX; -// evt.event_id = serial_number++; -// -// /* Based on request block (or not) for lack of space */ -// if (PREDICT_TRUE(q->cursize < q->maxsize)) -// unix_shared_memory_queue_add (app->event_queue, (u8 *) &evt, -// 0 /* do wait for mutex */); -// else -// { -// clib_warning("fifo full"); -// return -1; -// } -// } } /** @@ -976,6 +957,33 @@ session_get_transport_vft (u8 type) return &tp_vfts[type]; } +/** + * Allocate vpp event queue (once) per worker thread + */ +void +session_vpp_event_queue_allocate (session_manager_main_t * smm, + u32 thread_index) +{ + api_main_t *am = &api_main; + void *oldheap; + + if (smm->vpp_event_queues[thread_index] 
== 0) + { + /* Allocate event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + + smm->vpp_event_queues[thread_index] = + unix_shared_memory_queue_init (2048 /* nels $$$$ config */ , + sizeof (session_fifo_event_t), + 0 /* consumer pid */ , + 0 + /* (do not) send signal when queue non-empty */ + ); + + svm_pop_heap (oldheap); + } +} + static clib_error_t * session_manager_main_enable (vlib_main_t * vm) { @@ -1043,6 +1051,18 @@ session_manager_main_enable (vlib_main_t * vm) return 0; } +void +session_node_enable_disable (u8 is_en) +{ + u8 state = is_en ? VLIB_NODE_STATE_POLLING : VLIB_NODE_STATE_DISABLED; + /* *INDENT-OFF* */ + foreach_vlib_main (({ + vlib_node_set_state (this_vlib_main, session_queue_node.index, + state); + })); + /* *INDENT-ON* */ +} + clib_error_t * vnet_session_enable_disable (vlib_main_t * vm, u8 is_en) { @@ -1051,16 +1071,14 @@ vnet_session_enable_disable (vlib_main_t * vm, u8 is_en) if (session_manager_main.is_enabled) return 0; - vlib_node_set_state (vm, session_queue_node.index, - VLIB_NODE_STATE_POLLING); + session_node_enable_disable (is_en); return session_manager_main_enable (vm); } else { session_manager_main.is_enabled = 0; - vlib_node_set_state (vm, session_queue_node.index, - VLIB_NODE_STATE_DISABLED); + session_node_enable_disable (is_en); } return 0; diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 6e4ea96d..8cd72f35 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -17,9 +17,6 @@ #include #include -#include -#include -#include #include #include @@ -31,10 +28,10 @@ typedef enum { - FIFO_EVENT_SERVER_RX, - FIFO_EVENT_SERVER_TX, + FIFO_EVENT_APP_RX, + FIFO_EVENT_APP_TX, FIFO_EVENT_TIMEOUT, - FIFO_EVENT_SERVER_EXIT, + FIFO_EVENT_DISCONNECT, FIFO_EVENT_BUILTIN_RX } fifo_event_type_t; @@ -96,7 +93,11 @@ typedef enum /* *INDENT-OFF* */ typedef CLIB_PACKED (struct { - svm_fifo_t * fifo; + union + { + svm_fifo_t * fifo; + u64 session_handle; + 
}; u8 event_type; u16 event_id; }) session_fifo_event_t; @@ -370,7 +371,9 @@ int stream_session_listen (stream_session_t * s, transport_endpoint_t * tep); int stream_session_stop_listen (stream_session_t * s); void stream_session_disconnect (stream_session_t * s); void stream_session_cleanup (stream_session_t * s); - +void session_send_session_evt_to_thread (u64 session_handle, + fifo_event_type_t evt_type, + u32 thread_index); u8 *format_stream_session (u8 * s, va_list * args); void session_register_transport (u8 type, const transport_proto_vft_t * vft); diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 8116b673..79d67a2f 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -96,7 +96,7 @@ send_session_accept_callback (stream_session_t * s) memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SESSION); - + mp->context = server->index; listener = listen_session_get (s->session_type, s->listener_index); tp_vft = session_get_transport_vft (s->session_type); tc = tp_vft->get_connection (s->connection_index, s->thread_index); @@ -270,23 +270,6 @@ static session_cb_vft_t uri_session_cb_vft = { .redirect_connect_callback = redirect_connect_callback }; -static int -api_session_not_valid (u32 session_index, u32 thread_index) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - stream_session_t *pool; - - if (thread_index >= vec_len (smm->sessions)) - return VNET_API_ERROR_INVALID_VALUE; - - pool = smm->sessions[thread_index]; - - if (pool_is_free_index (pool, session_index)) - return VNET_API_ERROR_INVALID_VALUE_2; - - return 0; -} - static void vl_api_session_enable_disable_t_handler (vl_api_session_enable_disable_t * mp) { @@ -324,9 +307,9 @@ vl_api_application_attach_t_handler (vl_api_application_attach_t * mp) rv = vnet_application_attach (a); done: + /* *INDENT-OFF* */ REPLY_MACRO2 (VL_API_APPLICATION_ATTACH_REPLY, ({ - rmp->retval = rv; if (!rv) { 
rmp->segment_name_length = 0; @@ -558,24 +541,33 @@ static void vl_api_accept_session_reply_t_handler (vl_api_accept_session_reply_t * mp) { stream_session_t *s; - int rv; u32 session_index, thread_index; - session_index = stream_session_index_from_handle (mp->handle); - thread_index = stream_session_thread_from_handle (mp->handle); - if (api_session_not_valid (session_index, thread_index)) - return; - - s = stream_session_get (session_index, thread_index); - rv = mp->retval; + vnet_disconnect_args_t _a, *a = &_a; - if (rv) + /* Server isn't interested, kill the session */ + if (mp->retval) { - /* Server isn't interested, kill the session */ - stream_session_disconnect (s); - return; + a->app_index = mp->context; + a->handle = mp->handle; + vnet_disconnect_session (a); + } + else + { + stream_session_parse_handle (mp->handle, &session_index, &thread_index); + s = stream_session_get_if_valid (session_index, thread_index); + if (!s) + { + clib_warning ("session doesn't exist"); + return; + } + if (s->app_index != mp->context) + { + clib_warning ("app doesn't own session"); + return; + } + /* XXX volatile? */ + s->session_state = SESSION_STATE_READY; } - - s->session_state = SESSION_STATE_READY; } static void diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index f8fbf28c..276beb21 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -62,8 +62,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s) bytes_this_chunk = bytes_this_chunk < s->bytes_to_send ? bytes_this_chunk : s->bytes_to_send; - rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , - bytes_this_chunk, + rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, bytes_this_chunk, test_data + test_buf_offset); /* If we managed to enqueue data... 
*/ @@ -95,7 +94,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s) { /* Fabricate TX event, send to vpp */ evt.fifo = s->server_tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, @@ -113,7 +112,7 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) /* Allow enqueuing of new event */ // svm_fifo_unset_event (rx_fifo); - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), + n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), tm->rx_buf); if (n_read > 0) { @@ -457,6 +456,8 @@ attach_builtin_test_clients () options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options = options; return vnet_application_attach (a); diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 8308e3d9..34682699 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -180,7 +180,7 @@ builtin_server_rx_callback (stream_session_t * s) vec_validate (bsm->rx_buf, max_transfer - 1); _vec_len (bsm->rx_buf) = max_transfer; - actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, max_transfer, + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, bsm->rx_buf); ASSERT (actual_transfer == max_transfer); @@ -190,8 +190,7 @@ builtin_server_rx_callback (stream_session_t * s) * Echo back */ - n_written = - svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf); + n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, bsm->rx_buf); if (n_written != max_transfer) clib_warning ("short trout!"); @@ -200,7 +199,7 @@ builtin_server_rx_callback (stream_session_t * s) { /* Fabricate TX event, send to vpp */ evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = 
serial_number++; unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], @@ -288,6 +287,7 @@ server_attach () a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16; + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 12982589..245a35ab 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -487,7 +487,8 @@ u8 * format_tcp_connection (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - + if (!tc) + return s; if (tc->c_is_ip4) { s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T", @@ -747,12 +748,14 @@ void tcp_initialize_timer_wheels (tcp_main_t * tm) { tw_timer_wheel_16t_2w_512sl_t *tw; - vec_foreach (tw, tm->timer_wheels) - { + /* *INDENT-OFF* */ + foreach_vlib_main (({ + tw = &tm->timer_wheels[ii]; tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, 100e-3 /* timer period 100ms */ , ~0); - tw->last_run_time = vlib_time_now (tm->vlib_main); - } + tw->last_run_time = vlib_time_now (this_vlib_main); + })); + /* *INDENT-ON* */ } clib_error_t * diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 97679aaf..3bd53878 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1011,8 +1011,8 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, clib_warning ("ooo: offset %d len %d", offset, data_len); - rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, - data_len, vlib_buffer_get_current (b)); + rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, offset, data_len, + vlib_buffer_get_current (b)); /* Nothing written */ if (rv) @@ -2392,8 +2392,8 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); clib_memcpy 
(&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, - sizeof (t0->tcp_connection)); + if (tc0) + clib_memcpy (&t0->tcp_connection, tc0, sizeof (*tc0)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a7be8bd5..4e1a7aa5 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1558,7 +1558,6 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; tcp_tx_trace_t *t0; tcp_header_t *th0; - tcp_connection_t *tc0; u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1592,13 +1591,8 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, th0 = ip4_next_header ((ip4_header_t *) th0); else th0 = ip6_next_header ((ip6_header_t *) th0); - tc0 = - tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, - my_thread_index); t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, - sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 890e50b9..0146154b 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -351,8 +351,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) /* * Enqueue an initial (un-dequeued) chunk */ - rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , - sizeof (u32), (u8 *) test_data); + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) test_data); TCP_TEST ((rv == sizeof (u32)), "enqueued %d", rv); TCP_TEST ((f->tail == 4), "fifo tail %u", f->tail); @@ -364,7 +363,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { offset = (2 * i + 1) * sizeof (u32); data = (u8 *) (test_data + (2 * i + 1)); - rv = svm_fifo_enqueue_with_offset (f, 0, offset, sizeof (u32), data); + rv = 
svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, offset + sizeof (u32)); @@ -393,7 +392,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { offset = (2 * i + 0) * sizeof (u32); data = (u8 *) (test_data + (2 * i + 0)); - rv = svm_fifo_enqueue_with_offset (f, 0, offset, sizeof (u32), data); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i, offset, offset + sizeof (u32)); @@ -418,8 +417,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) /* * Enqueue the missing u32 */ - rv = svm_fifo_enqueue_nowait (f, 0 /* pid */ , sizeof (u32), - (u8 *) (test_data + 2)); + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) (test_data + 2)); if (verbose) vlib_cli_output (vm, "fifo after missing link: %U", format_svm_fifo, f, 1); @@ -432,8 +430,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) */ for (i = 0; i < 7; i++) { - rv = svm_fifo_dequeue_nowait (f, 0 /* pid */ , sizeof (u32), - (u8 *) & data_word); + rv = svm_fifo_dequeue_nowait (f, sizeof (u32), (u8 *) & data_word); if (rv != sizeof (u32)) { clib_warning ("bytes dequeues %u", rv); @@ -457,7 +454,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { offset = (2 * i + 1) * sizeof (u32); data = (u8 *) (test_data + (2 * i + 1)); - rv = svm_fifo_enqueue_with_offset (f, 0, offset, sizeof (u32), data); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, offset + sizeof (u32)); @@ -468,13 +465,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } - rv = svm_fifo_enqueue_with_offset (f, 0, 8, 21, data); + rv = svm_fifo_enqueue_with_offset (f, 8, 21, data); TCP_TEST ((rv == 0), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), "number of ooo segments %u", 
svm_fifo_number_ooo_segments (f)); vec_validate (data_buf, vec_len (data)); - svm_fifo_peek (f, 0, 0, vec_len (data), data_buf); + svm_fifo_peek (f, 0, vec_len (data), data_buf); if (compare_data (data_buf, data, 8, vec_len (data), &j)) { TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); @@ -491,7 +488,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { offset = (2 * i + 1) * sizeof (u32); data = (u8 *) (test_data + (2 * i + 1)); - rv = svm_fifo_enqueue_with_offset (f, 0, offset, sizeof (u32), data); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, offset + sizeof (u32)); @@ -502,13 +499,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } - rv = svm_fifo_enqueue_nowait (f, 0, 29, data); + rv = svm_fifo_enqueue_nowait (f, 29, data); TCP_TEST ((rv == 32), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); vec_validate (data_buf, vec_len (data)); - svm_fifo_peek (f, 0, 0, vec_len (data), data_buf); + svm_fifo_peek (f, 0, vec_len (data), data_buf); if (compare_data (data_buf, data, 0, vec_len (data), &j)) { TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); @@ -551,7 +548,7 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = vp + i; data64 = tp->offset; - rv = svm_fifo_enqueue_with_offset (f, 0, tp->offset, tp->len, + rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64); } @@ -565,7 +562,7 @@ tcp_test_fifo2 (vlib_main_t * vm) "first ooo seg length %u", ooo_seg->length); data64 = 0; - rv = svm_fifo_enqueue_nowait (f, 0, sizeof (u32), (u8 *) & data64); + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64); TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); svm_fifo_free (f); @@ -581,7 +578,7 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = &test_data[i]; data64 = tp->offset; - rv = 
svm_fifo_enqueue_with_offset (f, 0, tp->offset, tp->len, + rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64); if (rv) { @@ -599,7 +596,7 @@ tcp_test_fifo2 (vlib_main_t * vm) "first ooo seg length %u", ooo_seg->length); data64 = 0; - rv = svm_fifo_enqueue_nowait (f, 0, sizeof (u32), (u8 *) & data64); + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), (u8 *) & data64); TCP_TEST ((rv == 3000), "bytes to be enqueued %u", rv); @@ -755,7 +752,7 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < vec_len (generate); i++) { tp = generate + i; - rv = svm_fifo_enqueue_with_offset (f, 0, fifo_initial_offset + rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset, tp->len, (u8 *) data_pattern + tp->offset); } @@ -776,7 +773,7 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) u32 bytes_to_enq = 1; if (in_seq_all) bytes_to_enq = total_size; - rv = svm_fifo_enqueue_nowait (f, 0, bytes_to_enq, data_pattern + 0); + rv = svm_fifo_enqueue_nowait (f, bytes_to_enq, data_pattern + 0); if (verbose) vlib_cli_output (vm, "in-order enqueue returned %d", rv); @@ -793,7 +790,7 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) * Test if peeked data is the same as original data */ vec_validate (data_buf, vec_len (data_pattern)); - svm_fifo_peek (f, 0, 0, vec_len (data_pattern), data_buf); + svm_fifo_peek (f, 0, vec_len (data_pattern), data_buf); if (compare_data (data_buf, data_pattern, 0, vec_len (data_pattern), &j)) { TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], @@ -806,11 +803,11 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) */ if (drop) { - svm_fifo_dequeue_drop (f, 0, vec_len (data_pattern)); + svm_fifo_dequeue_drop (f, vec_len (data_pattern)); } else { - svm_fifo_dequeue_nowait (f, 0, vec_len (data_pattern), data_buf); + svm_fifo_dequeue_nowait (f, vec_len (data_pattern), data_buf); if (compare_data (data_buf, data_pattern, 0, vec_len (data_pattern), &j)) { diff 
--git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c index 8565f04c..18684d54 100644 --- a/src/vnet/udp/builtin_server.c +++ b/src/vnet/udp/builtin_server.c @@ -59,10 +59,10 @@ builtin_server_rx_callback (stream_session_t * s) vec_validate (my_copy_buffer, this_transfer - 1); _vec_len (my_copy_buffer) = this_transfer; - actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, this_transfer, + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, this_transfer, my_copy_buffer); ASSERT (actual_transfer == this_transfer); - actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, 0, this_transfer, + actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, this_transfer, my_copy_buffer); ASSERT (actual_transfer == this_transfer); @@ -72,7 +72,7 @@ builtin_server_rx_callback (stream_session_t * s) { /* Fabricate TX event, send to ourselves */ evt.fifo = tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = 0; q = session_manager_get_vpp_event_queue (s->thread_index); unix_shared_memory_queue_add (q, (u8 *) & evt, @@ -110,6 +110,8 @@ attach_builtin_uri_server () options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options = options; return vnet_application_attach (a); diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 810278e6..e6b4f8fc 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -145,8 +145,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, goto trace0; } - svm_fifo_enqueue_nowait (f0, 0 /* pid */ , - udp_len0 - sizeof (*udp0), + svm_fifo_enqueue_nowait (f0, udp_len0 - sizeof (*udp0), (u8 *) (udp0 + 1)); b0->error = node->errors[SESSION_ERROR_ENQUEUED]; @@ -255,7 +254,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, { /* Fabricate event */ evt.fifo = s0->server_rx_fifo; - evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_type = 
FIFO_EVENT_APP_RX; evt.event_id = serial_number++; /* Add event to server's event queue */ -- cgit 1.2.3-korg From 636815199a1f359fdd0da706985a74eca95254da Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Thu, 20 Apr 2017 17:50:39 -0400 Subject: Debug CLI to magically create / delete a TCP session The session ends up in established state, and is hand-crafted to look like it was created by the builtin_server. This will come in handy for injecting packets into tcp46-established, along with ancillary debug CLI to adjust connection parameters. Immediate applications include screwball window cases, out of order segments, paws checking, and so on and so forth. Debug CLI script: loop create set int ip address loop0 6.0.1.1/8 set int state loop0 up set ip arp loop0 6.0.1.2 feed.face.babe test tcp server test tcp session packet-generator new { name tcp limit 1 node ip4-input size 100-100 interface loop0 no-recycle data { TCP: 6.0.1.2 -> 6.0.1.1 TCP: 11234 -> 1234 ACK window 2000 seqnum 0 acknum 0 incrementing 100 } } Change-Id: I866c2159376064b7d14f70531022c1fe949258c2 Signed-off-by: Dave Barach --- src/vnet/tcp/tcp_format.c | 3 +- src/vnet/tcp/tcp_input.c | 2 +- src/vnet/tcp/tcp_pg.c | 108 +++++++++++++++++++++++++--------------------- src/vnet/tcp/tcp_test.c | 69 +++++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 55 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c index 994ccfd6..1ca2f58e 100644 --- a/src/vnet/tcp/tcp_format.c +++ b/src/vnet/tcp/tcp_format.c @@ -45,7 +45,8 @@ format_tcp_flags (u8 * s, va_list * args) { int flags = va_arg (*args, int); -#define _(f) if (flags & TCP_FLAG_##f) s = format (s, "%s, ", #f); + s = format (s, "0x%02x", flags); +#define _(f) if (flags & TCP_FLAG_##f) s = format (s, " %s", #f); foreach_tcp_flag #undef _ return s; diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 3bd53878..bfe3665a 100644 --- a/src/vnet/tcp/tcp_input.c +++ 
b/src/vnet/tcp/tcp_input.c @@ -2376,7 +2376,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vnet_buffer (b0)->tcp.flags = tc0->state; clib_warning ("disp error state %U flags %U", format_tcp_state, &state0, - format_tcp_flags, flags0); + format_tcp_flags, (int) flags0); } } else diff --git a/src/vnet/tcp/tcp_pg.c b/src/vnet/tcp/tcp_pg.c index dc324049..3be4592c 100644 --- a/src/vnet/tcp/tcp_pg.c +++ b/src/vnet/tcp/tcp_pg.c @@ -54,21 +54,19 @@ static void tcp_pg_edit_function (pg_main_t * pg, pg_stream_t * s, - pg_edit_group_t * g, - u32 * packets, - u32 n_packets) + pg_edit_group_t * g, u32 * packets, u32 n_packets) { - vlib_main_t * vm = vlib_get_main(); + vlib_main_t *vm = vlib_get_main (); u32 ip_offset, tcp_offset; tcp_offset = g->start_byte_offset; - ip_offset = (g-1)->start_byte_offset; + ip_offset = (g - 1)->start_byte_offset; while (n_packets >= 1) { - vlib_buffer_t * p0; - ip4_header_t * ip0; - tcp_header_t * tcp0; + vlib_buffer_t *p0; + ip4_header_t *ip0; + tcp_header_t *tcp0; ip_csum_t sum0; u32 tcp_len0; @@ -85,7 +83,9 @@ tcp_pg_edit_function (pg_main_t * pg, if (BITS (sum0) == 32) { sum0 = clib_mem_unaligned (&ip0->src_address, u32); - sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32)); + sum0 = + ip_csum_with_carry (sum0, + clib_mem_unaligned (&ip0->dst_address, u32)); } else sum0 = clib_mem_unaligned (&ip0->src_address, u64); @@ -96,20 +96,22 @@ tcp_pg_edit_function (pg_main_t * pg, /* Invalidate possibly old checksum. 
*/ tcp0->checksum = 0; - sum0 = ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); + sum0 = + ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); - tcp0->checksum = ~ ip_csum_fold (sum0); + tcp0->checksum = ~ip_csum_fold (sum0); } } -typedef struct { +typedef struct +{ pg_edit_t src, dst; pg_edit_t seq_number, ack_number; pg_edit_t data_offset_and_reserved; #define _(f) pg_edit_t f##_flag; - foreach_tcp_flag + foreach_tcp_flag #undef _ - pg_edit_t window; + pg_edit_t window; pg_edit_t checksum; pg_edit_t urgent_pointer; } pg_tcp_header_t; @@ -119,13 +121,13 @@ pg_tcp_header_init (pg_tcp_header_t * p) { /* Initialize fields that are not bit fields in the IP header. */ #define _(f) pg_edit_init (&p->f, tcp_header_t, f); - _ (src); - _ (dst); - _ (seq_number); - _ (ack_number); - _ (window); - _ (checksum); - _ (urgent_pointer); + _(src); + _(dst); + _(seq_number); + _(ack_number); + _(window); + _(checksum); + _(urgent_pointer); #undef _ /* Initialize bit fields. 
*/ @@ -136,19 +138,17 @@ pg_tcp_header_init (pg_tcp_header_t * p) foreach_tcp_flag #undef _ - - pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t, - data_offset_and_reserved, - 4, 4); + pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t, + data_offset_and_reserved, 4, 4); } uword unformat_pg_tcp_header (unformat_input_t * input, va_list * args) { - pg_stream_t * s = va_arg (*args, pg_stream_t *); - pg_tcp_header_t * p; + pg_stream_t *s = va_arg (*args, pg_stream_t *); + pg_tcp_header_t *p; u32 group_index; - + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t), &group_index); pg_tcp_header_init (p); @@ -157,8 +157,8 @@ unformat_pg_tcp_header (unformat_input_t * input, va_list * args) pg_edit_set_fixed (&p->seq_number, 0); pg_edit_set_fixed (&p->ack_number, 0); - pg_edit_set_fixed (&p->data_offset_and_reserved, - sizeof (tcp_header_t) / sizeof (u32)); + pg_edit_set_fixed (&p->data_offset_and_reserved, + sizeof (tcp_header_t) / sizeof (u32)); pg_edit_set_fixed (&p->window, 4096); pg_edit_set_fixed (&p->urgent_pointer, 0); @@ -166,43 +166,44 @@ unformat_pg_tcp_header (unformat_input_t * input, va_list * args) #define _(f) pg_edit_set_fixed (&p->f##_flag, 0); foreach_tcp_flag #undef _ + p->checksum.type = PG_EDIT_UNSPECIFIED; - p->checksum.type = PG_EDIT_UNSPECIFIED; - - if (! unformat (input, "TCP: %U -> %U", - unformat_pg_edit, - unformat_tcp_udp_port, &p->src, - unformat_pg_edit, - unformat_tcp_udp_port, &p->dst)) + if (!unformat (input, "TCP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src, + unformat_pg_edit, unformat_tcp_udp_port, &p->dst)) goto error; /* Parse options. 
*/ while (1) { if (unformat (input, "window %U", - unformat_pg_edit, - unformat_pg_number, &p->window)) + unformat_pg_edit, unformat_pg_number, &p->window)) ; else if (unformat (input, "checksum %U", - unformat_pg_edit, - unformat_pg_number, &p->checksum)) + unformat_pg_edit, unformat_pg_number, &p->checksum)) ; + else if (unformat (input, "seqnum %U", unformat_pg_edit, + unformat_pg_number, &p->seq_number)) + ; + else if (unformat (input, "acknum %U", unformat_pg_edit, + unformat_pg_number, &p->ack_number)) + ; /* Flags. */ #define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1); - foreach_tcp_flag + foreach_tcp_flag #undef _ - - /* Can't parse input: try next protocol level. */ - else + /* Can't parse input: try next protocol level. */ + else break; } { - ip_main_t * im = &ip_main; + ip_main_t *im = &ip_main; u16 dst_port; - tcp_udp_port_info_t * pi; + tcp_udp_port_info_t *pi; pi = 0; if (p->dst.type == PG_EDIT_FIXED) @@ -215,12 +216,12 @@ unformat_pg_tcp_header (unformat_input_t * input, va_list * args) && unformat_user (input, pi->unformat_pg_edit, s)) ; - else if (! unformat_user (input, unformat_pg_payload, s)) + else if (!unformat_user (input, unformat_pg_payload, s)) goto error; if (p->checksum.type == PG_EDIT_UNSPECIFIED) { - pg_edit_group_t * g = pg_stream_get_group (s, group_index); + pg_edit_group_t *g = pg_stream_get_group (s, group_index); g->edit_function = tcp_pg_edit_function; g->edit_function_opaque = 0; } @@ -228,9 +229,16 @@ unformat_pg_tcp_header (unformat_input_t * input, va_list * args) return 1; } - error: +error: /* Free up any edits we may have added. 
*/ pg_free_edit_group (s); return 0; } +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 0146154b..d65ce1be 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -895,6 +895,68 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) return res; } +static int +tcp_test_session (vlib_main_t * vm, unformat_input_t * input) +{ + int rv = 0; + tcp_connection_t *tc0; + u8 sst = SESSION_TYPE_IP4_TCP; + ip4_address_t local, remote; + u16 local_port, remote_port; + tcp_main_t *tm = vnet_get_tcp_main (); + int is_add = 1; + + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "del")) + is_add = 0; + else if (unformat (input, "add")) + is_add = 1; + else + break; + } + + if (is_add) + { + local.as_u32 = clib_host_to_net_u32 (0x06000101); + remote.as_u32 = clib_host_to_net_u32 (0x06000102); + local_port = clib_host_to_net_u16 (1234); + remote_port = clib_host_to_net_u16 (11234); + + pool_get (tm->connections[0], tc0); + memset (tc0, 0, sizeof (*tc0)); + + tc0->state = TCP_STATE_ESTABLISHED; + tc0->rcv_las = 1; + tc0->c_c_index = tc0 - tm->connections[0]; + tc0->c_lcl_port = local_port; + tc0->c_rmt_port = remote_port; + tc0->c_is_ip4 = 1; + tc0->c_thread_index = 0; + tc0->c_lcl_ip4.as_u32 = local.as_u32; + tc0->c_rmt_ip4.as_u32 = remote.as_u32; + tc0->opt.mss = 1450; + tcp_connection_init_vars (tc0); + + TCP_EVT_DBG (TCP_EVT_OPEN, tc0); + + if (stream_session_accept (&tc0->connection, 0 /* listener index */ , + sst, 0 /* notify */ )) + clib_warning ("stream_session_accept failed"); + + stream_session_accept_notify (&tc0->connection); + } + else + { + tc0 = tcp_connection_get (0 /* connection index */ , 0 /* thread */ ); + tc0->state = TCP_STATE_CLOSED; + stream_session_disconnect_notify (&tc0->connection); + } + + return rv; +} + static clib_error_t * tcp_test (vlib_main_t * vm, 
unformat_input_t * input, vlib_cli_command_t * cmd_arg) @@ -911,11 +973,12 @@ tcp_test (vlib_main_t * vm, { res = tcp_test_fifo (vm, input); } - else + else if (unformat (input, "session")) { - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + res = tcp_test_session (vm, input); } + else + break; } if (res) -- cgit 1.2.3-korg From 45d3496f3d86ee1a930ce0ffd6ca3d1730355eb8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 25 Apr 2017 00:05:27 -0700 Subject: Add sack tx unit test Change-Id: Ib91db6e531231bdc52b0104673a912bee024872f Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.h | 6 +- src/vnet/tcp/tcp_format.c | 12 ++++ src/vnet/tcp/tcp_input.c | 42 +++++++++----- src/vnet/tcp/tcp_test.c | 143 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 185 insertions(+), 18 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 2ac6a9b8..40fb3515 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -59,6 +59,7 @@ typedef enum _tcp_state format_function_t format_tcp_state; format_function_t format_tcp_flags; +format_function_t format_tcp_sacks; /** TCP timers */ #define foreach_tcp_timer \ @@ -470,11 +471,13 @@ tcp_available_snd_space (const tcp_connection_t * tc) void tcp_update_rcv_wnd (tcp_connection_t * tc); void tcp_retransmit_first_unacked (tcp_connection_t * tc); - void tcp_fast_retransmit (tcp_connection_t * tc); void tcp_cc_congestion (tcp_connection_t * tc); void tcp_cc_recover (tcp_connection_t * tc); +/* Made public for unit testing only */ +void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); + always_inline u32 tcp_time_now (void) { @@ -496,7 +499,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); - void tcp_connection_init_vars (tcp_connection_t * tc); always_inline void diff --git 
a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c index 1ca2f58e..3148fd40 100644 --- a/src/vnet/tcp/tcp_format.c +++ b/src/vnet/tcp/tcp_format.c @@ -128,6 +128,18 @@ format_tcp_header (u8 * s, va_list * args) return s; } +u8 * +format_tcp_sacks (u8 * s, va_list * args) +{ + sack_block_t *sacks = va_arg (*args, sack_block_t *); + sack_block_t *block; + vec_foreach (block, sacks) + { + s = format (s, " start %u end %u\n", block->start, block->end); + } + return s; +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index bfe3665a..e184a4d6 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -894,37 +894,51 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, * @param start Start sequence number of the newest SACK block * @param end End sequence of the newest SACK block */ -static void +void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { - sack_block_t *new_list = 0, block; + sack_block_t *new_list = 0, *block = 0; int i; /* If the first segment is ooo add it to the list. Last write might've moved * rcv_nxt over the first segment. */ if (seq_lt (tc->rcv_nxt, start)) { - block.start = start; - block.end = end; - vec_add1 (new_list, block); + vec_add2 (new_list, block, 1); + block->start = start; + block->end = end; } /* Find the blocks still worth keeping. */ for (i = 0; i < vec_len (tc->snd_sacks); i++) { - /* Discard if: - * 1) rcv_nxt advanced beyond current block OR - * 2) Segment overlapped by the first segment, i.e., it has been merged - * into it.*/ - if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt) - || seq_leq (tc->snd_sacks[i].start, end)) + /* Discard if rcv_nxt advanced beyond current block */ + if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt)) continue; - /* Save to new SACK list. 
*/ - vec_add1 (new_list, tc->snd_sacks[i]); + /* Merge or drop if segment overlapped by the new segment */ + if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start) + && seq_leq (tc->snd_sacks[i].start, new_list[0].end))) + { + if (seq_lt (tc->snd_sacks[i].start, new_list[0].start)) + new_list[0].start = tc->snd_sacks[i].start; + if (seq_lt (new_list[0].end, tc->snd_sacks[i].end)) + new_list[0].end = tc->snd_sacks[i].end; + continue; + } + + /* Save to new SACK list if we have space. */ + if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS) + { + vec_add1 (new_list, tc->snd_sacks[i]); + } + else + { + clib_warning ("dropped sack blocks"); + } } - ASSERT (vec_len (new_list) < TCP_MAX_SACK_BLOCKS); + ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); /* Replace old vector with new one */ vec_free (tc->snd_sacks); diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index d65ce1be..bca5795a 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -35,7 +35,7 @@ } static int -tcp_test_sack () +tcp_test_sack_rx () { tcp_connection_t _tc, *tc = &_tc; sack_scoreboard_t *sb = &tc->sack_sb; @@ -173,6 +173,145 @@ tcp_test_sack () return 0; } +static int +tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_connection_t _tc, *tc = &_tc; + sack_block_t *sacks; + int i, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, + input); + return -1; + } + } + + memset (tc, 0, sizeof (*tc)); + + /* + * Add odd sack block pairs + */ + for (i = 1; i < 10; i += 2) + { + tcp_update_sack_list (tc, i * 100, (i + 1) * 100); + } + + TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 5); + TCP_TEST ((tc->snd_sacks[0].start = 900), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 900); + + /* + * Try to add one extra + */ + sacks 
= vec_dup (tc->snd_sacks); + + tcp_update_sack_list (tc, 1100, 1200); + TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 5); + TCP_TEST ((tc->snd_sacks[0].start == 1100), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* restore */ + vec_free (tc->snd_sacks); + tc->snd_sacks = sacks; + + /* + * Overlap first 2 segment + */ + tc->rcv_nxt = 300; + tcp_update_sack_list (tc, 300, 300); + if (verbose) + vlib_cli_output (vm, "overlap first 2 segments:\n%U", + format_tcp_sacks, tc->snd_sacks); + TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 3); + TCP_TEST ((tc->snd_sacks[0].start == 900), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 500); + + /* + * Add a new segment + */ + tcp_update_sack_list (tc, 1100, 1200); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc->snd_sacks); + TCP_TEST ((vec_len (tc->snd_sacks) == 4), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 4); + TCP_TEST ((tc->snd_sacks[0].start == 1100), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* + * Join middle segments + */ + tcp_update_sack_list (tc, 800, 900); + if (verbose) + vlib_cli_output (vm, "join middle segments [800, 900]\n%U", + format_tcp_sacks, tc->snd_sacks); + + TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 3); + TCP_TEST ((tc->snd_sacks[0].start == 700), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 1100); + + /* + * Advance rcv_nxt to overlap all + */ + tc->rcv_nxt = 1200; + tcp_update_sack_list (tc, 1200, 1200); + if (verbose) + vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", + format_tcp_sacks, tc->snd_sacks); + TCP_TEST ((vec_len (tc->snd_sacks) == 0), "sack blocks %d expected %d", + vec_len (tc->snd_sacks), 0); + return 0; +} + +static int +tcp_test_sack 
(vlib_main_t * vm, unformat_input_t * input) +{ + int res = 0; + + /* Run all tests */ + if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT) + { + if (tcp_test_sack_tx (vm, input)) + { + return -1; + } + + if (tcp_test_sack_rx ()) + { + return -1; + } + } + else + { + if (unformat (input, "tx")) + { + res = tcp_test_sack_tx (vm, input); + } + else if (unformat (input, "rx")) + { + res = tcp_test_sack_rx (); + } + } + + return res; +} + + typedef struct { u32 offset; @@ -967,7 +1106,7 @@ tcp_test (vlib_main_t * vm, { if (unformat (input, "sack")) { - res = tcp_test_sack (); + res = tcp_test_sack (vm, input); } else if (unformat (input, "fifo")) { -- cgit 1.2.3-korg From 82b13a89a3cd436b9d3ed5538952508354ea65ba Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 25 Apr 2017 11:58:06 -0700 Subject: Session/tcp coverity fixes Change-Id: Ic5467df16e870b49c49678b1dbb40f4a2390b3c9 Signed-off-by: Florin Coras --- src/uri/uri_tcp_test.c | 2 + src/vnet/buffer.h | 6 +- src/vnet/session/session_api.c | 6 ++ src/vnet/tcp/builtin_client.c | 3 + src/vnet/tcp/tcp.h | 8 ++ src/vnet/tcp/tcp_error.def | 1 + src/vnet/tcp/tcp_input.c | 165 +++++++++++++++++------------------------ src/vnet/tcp/tcp_test.c | 9 +-- 8 files changed, 98 insertions(+), 102 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 686c93f9..0b4aae37 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -616,6 +616,8 @@ client_send_data (uri_tcp_test_main_t * utm) session = pool_elt_at_index (utm->sessions, utm->connected_session_index); tx_fifo = session->server_tx_fifo; + ASSERT (vec_len (test_data) > 0); + vec_validate (utm->rx_buf, vec_len (test_data) - 1); n_iterations = utm->bytes_to_send / vec_len (test_data); diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index ed869d1f..5d1b1c4d 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -83,7 +83,8 @@ _(policer) \ _(ipsec) \ _(map) \ _(map_t) \ -_(ip_frag) 
+_(ip_frag) \ +_(tcp) /* * vnet stack buffer opaque array overlay structure. @@ -279,6 +280,9 @@ typedef struct u32 seq_number; u32 seq_end; u32 ack_number; + u16 hdr_offset; /**< offset relative to ip hdr */ + u16 data_offset; /**< offset relative to ip hdr */ + u16 data_len; /**< data len */ u8 flags; } tcp; diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 79d67a2f..5a02a08e 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -227,6 +227,12 @@ redirect_connect_callback (u32 server_api_client_index, void *mp_arg) /* Tell the server the client's API queue address, so it can reply */ mp->client_queue_address = (u64) client_q; app = application_lookup (mp->client_index); + if (!app) + { + clib_warning ("no client application"); + return -1; + } + mp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = app->sm_properties.rx_fifo_size; mp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = app->sm_properties.tx_fifo_size; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 276beb21..32d69a96 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -56,6 +56,9 @@ send_test_chunk (tclient_main_t * tm, session_t * s) session_fifo_event_t evt; static int serial_number = 0; int rv; + + ASSERT (vec_len (test_data) > 0); + test_buf_offset = s->bytes_sent % vec_len (test_data); bytes_this_chunk = vec_len (test_data) - test_buf_offset; diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 40fb3515..f61a1b52 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -351,6 +351,14 @@ vnet_get_tcp_main () return &tcp_main; } +always_inline tcp_header_t * +tcp_buffer_hdr (vlib_buffer_t * b) +{ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return (tcp_header_t *) (b->data + b->current_data + + vnet_buffer (b)->tcp.hdr_offset); +} + clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); always_inline tcp_connection_t * diff --git 
a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index b91a08c0..0d75d975 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -13,6 +13,7 @@ * limitations under the License. */ tcp_error (NONE, "no error") +tcp_error (LENGTH, "inconsistent ip/tcp lengths") tcp_error (NO_LISTENER, "no listener for dst port") tcp_error (LOOKUP_DROPS, "lookup drops") tcp_error (DISPATCH, "Dispatch error") diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index e184a4d6..3c65a5ea 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1176,6 +1176,21 @@ format_tcp_rx_trace_short (u8 * s, va_list * args) return s; } +void +tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0, + tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4) +{ + if (tc0) + { + clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection)); + } + else + { + th0 = tcp_buffer_hdr (b0); + } + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); +} + always_inline void tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val) { @@ -1212,12 +1227,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; - tcp_rx_trace_t *t0; tcp_header_t *th0 = 0; tcp_connection_t *tc0; - ip4_header_t *ip40; - ip6_header_t *ip60; - u32 n_advance_bytes0, n_data_bytes0; u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; bi0 = from[0]; @@ -1237,32 +1248,13 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto done; } - /* Checksum computed by ipx_local no need to compute again */ - - if (is_ip4) - { - ip40 = vlib_buffer_get_current (b0); - th0 = ip4_next_header (ip40); - n_advance_bytes0 = (ip4_header_bytes (ip40) - + tcp_header_bytes (th0)); - n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - - n_advance_bytes0; - } - else - { - ip60 = vlib_buffer_get_current (b0); - th0 = ip6_next_header (ip60); - n_advance_bytes0 = tcp_header_bytes (th0); 
- n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - - n_advance_bytes0; - n_advance_bytes0 += sizeof (ip60[0]); - } + th0 = tcp_buffer_hdr (b0); is_fin = (th0->flags & TCP_FLAG_FIN) != 0; /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (th0) + is_fin + n_data_bytes0; + + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len; /* TODO header prediction fast path */ @@ -1286,8 +1278,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 7: process the segment text */ - vlib_buffer_advance (b0, n_advance_bytes0); - error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); + error0 = tcp_segment_rcv (tm, tc0, b0, + vnet_buffer (b0)->tcp.data_len, &next0); /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a * dangling reference. */ @@ -1308,10 +1301,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, - sizeof (t0->tcp_connection)); + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1416,9 +1408,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; - ip4_header_t *ip40; - ip6_header_t *ip60; - u32 n_advance_bytes0, n_data_bytes0; tcp_connection_t *new_tc0; u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; @@ -1436,27 +1425,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer 
(b0)->tcp.seq_number; - - /* Checksum computed by ipx_local no need to compute again */ - - if (is_ip4) - { - ip40 = vlib_buffer_get_current (b0); - tcp0 = ip4_next_header (ip40); - n_advance_bytes0 = (ip4_header_bytes (ip40) - + tcp_header_bytes (tcp0)); - n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - - n_advance_bytes0; - } - else - { - ip60 = vlib_buffer_get_current (b0); - tcp0 = ip6_next_header (ip60); - n_advance_bytes0 = tcp_header_bytes (tcp0); - n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - - n_advance_bytes0; - n_advance_bytes0 += sizeof (ip60[0]); - } + tcp0 = tcp_buffer_hdr (b0); if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) @@ -1464,7 +1433,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0) - + tcp_is_fin (tcp0) + n_data_bytes0; + + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len; /* * 1. check the ACK bit @@ -1591,10 +1560,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Read data, if any */ - if (n_data_bytes0) + if (vnet_buffer (b0)->tcp.data_len) { - error0 = - tcp_segment_rcv (tm, new_tc0, b0, n_data_bytes0, &next0); + vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); + error0 = tcp_segment_rcv (tm, new_tc0, b0, + vnet_buffer (b0)->tcp.data_len, + &next0); if (error0 == TCP_ERROR_PURE_ACK) error0 = TCP_ERROR_SYN_ACKS_RCVD; } @@ -1720,12 +1691,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; - tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; - ip4_header_t *ip40; - ip6_header_t *ip60; - u32 n_advance_bytes0, n_data_bytes0; u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; bi0 = from[0]; @@ -1744,30 +1711,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - /* Checksum computed by ipx_local no 
need to compute again */ - - if (is_ip4) - { - ip40 = vlib_buffer_get_current (b0); - tcp0 = ip4_next_header (ip40); - n_advance_bytes0 = (ip4_header_bytes (ip40) - + tcp_header_bytes (tcp0)); - n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - - n_advance_bytes0; - } - else - { - ip60 = vlib_buffer_get_current (b0); - tcp0 = ip6_next_header (ip60); - n_advance_bytes0 = tcp_header_bytes (tcp0); - n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - - n_advance_bytes0; - n_advance_bytes0 += sizeof (ip60[0]); - } + tcp0 = tcp_buffer_hdr (b0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + n_data_bytes0; + + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + + vnet_buffer (b0)->tcp.data_len; /* * Special treatment for CLOSED @@ -1911,8 +1860,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: - vlib_buffer_advance (b0, n_advance_bytes0); - error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); + error0 = tcp_segment_rcv (tm, tc0, b0, + vnet_buffer (b0)->tcp.data_len, + &next0); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -1964,15 +1915,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + drop: b0->error = error0 ? 
node->errors[error0] : 0; - drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, - sizeof (t0->tcp_connection)); + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -2320,9 +2270,9 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (n_left_from > 0 && n_left_to_next > 0) { + int n_advance_bytes0, n_data_bytes0; u32 bi0; vlib_buffer_t *b0; - tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -2340,10 +2290,16 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); vnet_buffer (b0)->tcp.flags = 0; + /* Checksum computed by ipx_local no need to compute again */ + if (is_ip4) { ip40 = vlib_buffer_get_current (b0); tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; /* lookup session */ tc0 = @@ -2359,6 +2315,11 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { ip60 = vlib_buffer_get_current (b0); tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + tc0 = (tcp_connection_t *) stream_session_lookup_transport6 (&ip60->src_address, @@ -2369,6 +2330,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, my_thread_index); } + /* Length check */ + if (PREDICT_FALSE (n_advance_bytes0 < 0)) + { + error0 = TCP_ERROR_LENGTH; + goto done; + } + /* Session exists */ if (PREDICT_TRUE (0 != tc0)) { @@ -2379,6 +2347,11 @@ tcp46_input_inline (vlib_main_t * vm, 
vlib_node_runtime_t * node, vnet_buffer (b0)->tcp.ack_number = clib_net_to_host_u32 (tcp0->ack_number); + vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0 + - (u8 *) vlib_buffer_get_current (b0); + vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0; + vnet_buffer (b0)->tcp.data_len = n_data_bytes0; + flags0 = tcp0->flags & filter_flags; next0 = tm->dispatch_table[tc0->state][flags0].next; error0 = tm->dispatch_table[tc0->state][flags0].error; @@ -2400,14 +2373,14 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, error0 = TCP_ERROR_NO_LISTENER; } + done: b0->error = error0 ? node->errors[error0] : 0; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); - if (tc0) - clib_memcpy (&t0->tcp_connection, tc0, sizeof (*tc0)); + tcp_rx_trace_t *t0 = + vlib_add_trace (vm, node, b0, sizeof (*t0)); + tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index bca5795a..ed032206 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -687,8 +687,7 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = vp + i; data64 = tp->offset; - rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, - (u8 *) & data64); + svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64); } /* Expected result: one big fat chunk at offset 4 */ @@ -891,9 +890,9 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < vec_len (generate); i++) { tp = generate + i; - rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset - + tp->offset, tp->len, - (u8 *) data_pattern + tp->offset); + svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset, + tp->len, + (u8 *) data_pattern + tp->offset); } /* -- cgit 1.2.3-korg From c28764fd356632763614ea579f678d8f55eca4c7 Mon Sep 17 00:00:00 2001 From: Florin 
Coras Date: Wed, 26 Apr 2017 00:08:42 -0700 Subject: TCP ooo reception fixes - Improve svm fifo handling of out-of-order segments - Ensure tsval_recent is updated only if rcv_las falls withing the segments's sequence space - Avoid directly dropping old ACKs - Improve debugging Change-Id: I88dbe2394a0ad7eb389a4cc12d013a13733953aa Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 144 ++++++++++++++++++++++----------------------- src/svm/svm_fifo.h | 1 + src/vnet/session/session.c | 15 +++++ src/vnet/session/session.h | 7 ++- src/vnet/tcp/tcp_debug.h | 15 +++++ src/vnet/tcp/tcp_error.def | 3 +- src/vnet/tcp/tcp_format.c | 6 +- src/vnet/tcp/tcp_input.c | 81 +++++++++++++++++-------- src/vnet/tcp/tcp_output.c | 4 +- src/vnet/tcp/tcp_test.c | 114 ++++++++++++++++++++++++++++++++--- 10 files changed, 275 insertions(+), 115 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 8f2ed0c9..9b09d0c2 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -17,6 +17,8 @@ #define offset_lt(_a, _b) ((i32)((_a)-(_b)) < 0) #define offset_leq(_a, _b) ((i32)((_a)-(_b)) <= 0) +#define offset_gt(_a, _b) ((i32)((_a)-(_b)) > 0) +#define offset_geq(_a, _b) ((i32)((_a)-(_b)) >= 0) u8 * format_ooo_segment (u8 * s, va_list * args) @@ -160,14 +162,23 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) && offset_leq (ooo_segment_offset (f, s), offset)) s = pool_elt_at_index (f->ooo_segments, s->next); + /* If we have a previous and we overlap it, use it as starting point */ + prev = ooo_segment_get_prev (f, s); + if (prev && offset_leq (offset, ooo_segment_end_offset (f, prev))) + { + s = prev; + prev = ooo_segment_get_prev (f, s); + s_sof = ooo_segment_offset (f, s); + s_eof = ooo_segment_end_offset (f, s); + goto merge; + } + s_index = s - f->ooo_segments; s_sof = ooo_segment_offset (f, s); s_eof = ooo_segment_end_offset (f, s); - prev = ooo_segment_get_prev (f, s); /* No overlap, add before current segment */ - if 
(offset_lt (end_offset, s_sof) - && (!prev || offset_lt (prev->start + prev->length, offset))) + if (offset_lt (end_offset, s_sof)) { new_s = ooo_segment_new (f, offset, length); new_index = new_s - f->ooo_segments; @@ -192,7 +203,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) return; } /* No overlap, add after current segment */ - else if (offset_lt (s_eof, offset)) + else if (offset_gt (offset, s_eof)) { new_s = ooo_segment_new (f, offset, length); new_index = new_s - f->ooo_segments; @@ -218,62 +229,16 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) * Merge needed */ +merge: + /* Merge at head */ - if (offset_leq (offset, s_sof)) + if (offset_lt (offset, s_sof)) { - /* If we have a previous, check if we overlap */ - if (s->prev != OOO_SEGMENT_INVALID_INDEX) - { - prev = pool_elt_at_index (f->ooo_segments, s->prev); - - /* New segment merges prev and current. Remove previous and - * update position of current. */ - if (offset_leq (offset, ooo_segment_end_offset (f, prev))) - { - s->start = prev->start; - s->length = s_eof - ooo_segment_offset (f, prev); - ooo_segment_del (f, s->prev); - } - else - { - s->start = offset; - s->length = s_eof - ooo_segment_offset (f, s); - } - } - else - { - s->start = offset; - s->length = s_eof - ooo_segment_offset (f, s); - } - - /* The new segment's tail may cover multiple smaller ones */ - if (offset_lt (s_eof, end_offset)) - { - /* Remove segments completely covered */ - it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? - pool_elt_at_index (f->ooo_segments, s->next) : 0; - while (it && offset_lt (ooo_segment_end_offset (f, it), end_offset)) - { - next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? - pool_elt_at_index (f->ooo_segments, it->next) : 0; - ooo_segment_del (f, it - f->ooo_segments); - it = next; - } - - /* Update length. Segment's start might have changed. 
*/ - s->length = end_offset - ooo_segment_offset (f, s); - - /* If partial overlap with last, merge */ - if (it && offset_lt (ooo_segment_offset (f, it), end_offset)) - { - s->length += - it->length - (ooo_segment_offset (f, it) - end_offset); - ooo_segment_del (f, it - f->ooo_segments); - } - } + s->start = offset; + s->length = s_eof - ooo_segment_offset (f, s); } /* Last but overlapping previous */ - else if (offset_leq (s_eof, end_offset)) + else if (offset_gt (end_offset, s_eof)) { s->length = end_offset - ooo_segment_offset (f, s); } @@ -281,8 +246,33 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) else { /* Do Nothing */ + goto done; + } + + /* The new segment's tail may cover multiple smaller ones */ + if (offset_geq (end_offset, s_eof)) + { + /* Remove the completely overlapped segments */ + it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? + pool_elt_at_index (f->ooo_segments, s->next) : 0; + while (it && offset_leq (ooo_segment_end_offset (f, it), end_offset)) + { + next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? 
+ pool_elt_at_index (f->ooo_segments, it->next) : 0; + ooo_segment_del (f, it - f->ooo_segments); + it = next; + } + + /* If partial overlap with last, merge */ + if (it && offset_leq (ooo_segment_offset (f, it), end_offset)) + { + s->length = ooo_segment_end_offset (f, it) - + ooo_segment_offset (f, s); + ooo_segment_del (f, it - f->ooo_segments); + } } +done: /* Most recently updated segment */ f->ooos_newest = s - f->ooo_segments; } @@ -296,14 +286,17 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { ooo_segment_t *s; u32 index, bytes = 0, diff; - u32 cursize; + u32 cursize, norm_start, nitems; /* current size has not yet been updated */ cursize = svm_fifo_max_dequeue (f) + n_bytes_enqueued; + nitems = f->nitems; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); - diff = (f->nitems + (i32) (f->tail - s->start)) % f->nitems; + norm_start = s->start % nitems; + diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; + if (diff > cursize) return 0; @@ -326,7 +319,8 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - diff = (f->nitems + (i32) (f->tail - s->start)) % f->nitems; + norm_start = s->start % nitems; + diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; ooo_segment_del (f, index); } /* End of search */ @@ -340,11 +334,11 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) /* If tail is adjacent to an ooo segment, 'consume' it */ if (diff == 0) { - bytes = ((f->nitems - cursize) >= s->length) ? s->length : - f->nitems - cursize; + bytes = ((nitems - cursize) >= s->length) ? 
s->length : + nitems - cursize; f->tail += bytes; - f->tail %= f->nitems; + f->tail %= nitems; ooo_segment_del (f, s - f->ooo_segments); } @@ -430,31 +424,22 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - u32 normalized_offset; - int rv; - - /* Users would do well to avoid this */ - if (PREDICT_FALSE (f->tail == (offset % f->nitems))) - { - rv = svm_fifo_enqueue_internal (f, required_bytes, copy_from_here); - if (rv > 0) - return 0; - return -1; - } + u32 normalized_offset, offset_from_tail; /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; + normalized_offset = offset % nitems; /* Will this request fit? */ - if ((required_bytes + (offset - f->tail) % nitems) > (nitems - cursize)) + offset_from_tail = (nitems + normalized_offset - f->tail) % nitems; + if ((required_bytes + offset_from_tail) > (nitems - cursize)) return -1; ooo_segment_add (f, offset, required_bytes); /* Number of bytes we're going to copy */ total_copy_bytes = required_bytes; - normalized_offset = offset % nitems; /* Number of bytes in first copy segment */ first_copy_bytes = ((nitems - normalized_offset) < total_copy_bytes) @@ -631,6 +616,15 @@ svm_fifo_first_ooo_segment (svm_fifo_t * f) return pool_elt_at_index (f->ooo_segments, f->ooos_list_head); } +/** + * Set fifo pointers to requested offset + */ +void +svm_fifo_init_pointers (svm_fifo_t * f, u32 pointer) +{ + f->head = f->tail = pointer % f->nitems; +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index d67237c6..36158dc5 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -119,6 +119,7 @@ int svm_fifo_peek (svm_fifo_t * f, u32 offset, u32 max_bytes, u8 * copy_here); int svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes); u32 svm_fifo_number_ooo_segments (svm_fifo_t * f); ooo_segment_t 
*svm_fifo_first_ooo_segment (svm_fifo_t * f); +void svm_fifo_init_pointers (svm_fifo_t * f, u32 pointer); format_function_t format_svm_fifo; diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index d17c93f8..e92bb440 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -609,6 +609,21 @@ session_manager_flush_enqueue_events (u32 thread_index) return errors; } +/** + * Init fifo tail and head pointers + * + * Useful if transport uses absolute offsets for tracking ooo segments. + */ +void +stream_session_init_fifos_pointers (transport_connection_t * tc, + u32 rx_pointer, u32 tx_pointer) +{ + stream_session_t *s; + s = stream_session_get (tc->s_index, tc->thread_index); + svm_fifo_init_pointers (s->server_rx_fifo, rx_pointer); + svm_fifo_init_pointers (s->server_tx_fifo, tx_pointer); +} + void stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail) diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 8cd72f35..f41a8a96 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -352,9 +352,10 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); -void -stream_session_connect_notify (transport_connection_t * tc, u8 sst, - u8 is_fail); +void stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail); +void stream_session_init_fifos_pointers (transport_connection_t * tc, + u32 rx_pointer, u32 tx_pointer); void stream_session_accept_notify (transport_connection_t * tc); void stream_session_disconnect_notify (transport_connection_t * tc); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ecbf7887..b4497a3b 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -50,6 +50,7 @@ _(CC_EVT, "cc event") \ _(CC_PACK, "cc partial ack") \ _(SEG_INVALID, "invalid segment") \ + _(PAWS_FAIL, "failed paws 
check") \ _(ACK_RCV_ERR, "invalid ack") \ _(RCV_WND_SHRUNK, "shrunk rcv_wnd") \ @@ -382,6 +383,20 @@ typedef enum _tcp_dbg_evt ed->data[4] = _tc->rcv_wnd; \ } +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "paws fail: seq %u end %u tsval %u tsval_recent %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->opt.tsval; \ + ed->data[3] = _tc->tsval_recent; \ +} + #define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index 0d75d975..a4e46d64 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -37,4 +37,5 @@ tcp_error (PKTS_SENT, "Packets sent") tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") -tcp_error (NO_WND, "No window") \ No newline at end of file +tcp_error (NO_WND, "No window") +tcp_error (CONNECTION_CLOSED, "Connection closed") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c index 3148fd40..4de99235 100644 --- a/src/vnet/tcp/tcp_format.c +++ b/src/vnet/tcp/tcp_format.c @@ -131,11 +131,13 @@ format_tcp_header (u8 * s, va_list * args) u8 * format_tcp_sacks (u8 * s, va_list * args) { - sack_block_t *sacks = va_arg (*args, sack_block_t *); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->snd_sacks; sack_block_t *block; vec_foreach (block, sacks) { - s = format (s, " start %u end %u\n", block->start, block->end); + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); } return s; } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 3c65a5ea..0030cfe2 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -208,6 +208,15 
@@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) } } +/** + * RFC1323: Check against wrapped sequence numbers (PAWS). If we have + * timestamp to echo and it's less than tsval_recent, drop segment + * but still send an ACK in order to retain TCP's mechanism for detecting + * and recovering from half-open connections + * + * Or at least that's what the theory says. It seems that this might not work + * very well with packet reordering and fast retransmit. XXX + */ always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { @@ -215,6 +224,27 @@ tcp_segment_check_paws (tcp_connection_t * tc) && timestamp_lt (tc->opt.tsval, tc->tsval_recent); } +/** + * Update tsval recent + */ +always_inline void +tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) +{ + /* + * RFC1323: If Last.ACK.sent falls within the range of sequence numbers + * of an incoming segment: + * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN + * then the TSval from the segment is copied to TS.Recent; + * otherwise, the TSval is ignored. + */ + if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) + { + tc->tsval_recent = tc->opt.tsval; + tc->tsval_recent_age = tcp_time_now (); + } +} + /** * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19 * @@ -228,21 +258,16 @@ static int tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0) { - u8 paws_failed; - if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) return -1; tcp_options_parse (th0, &tc0->opt); - /* RFC1323: Check against wrapped sequence numbers (PAWS). 
If we have - * timestamp to echo and it's less than tsval_recent, drop segment - * but still send an ACK in order to retain TCP's mechanism for detecting - * and recovering from half-open connections */ - paws_failed = tcp_segment_check_paws (tc0); - if (paws_failed) + if (tcp_segment_check_paws (tc0)) { clib_warning ("paws failed"); + TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); /* If it just so happens that a segment updates tsval_recent for a * segment over 24 days old, invalidate tsval_recent. */ @@ -251,6 +276,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, { /* Age isn't reset until we get a valid tsval (bsd inspired) */ tc0->tsval_recent = 0; + clib_warning ("paws failed - really old segment. REALLY?"); } else { @@ -305,12 +331,9 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, return -1; } - /* If PAWS passed and segment in window, save timestamp */ - if (!paws_failed) - { - tc0->tsval_recent = tc0->opt.tsval; - tc0->tsval_recent_age = tcp_time_now (); - } + /* If segment in window, save timestamp */ + tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end); return 0; } @@ -835,7 +858,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); } - return -1; + /* Don't drop yet */ + return 0; } if (tcp_opts_sack_permitted (&tc->opt)) @@ -932,10 +956,6 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { vec_add1 (new_list, tc->snd_sacks[i]); } - else - { - clib_warning ("dropped sack blocks"); - } } ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); @@ -1011,7 +1031,6 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; - u32 offset; int rv; /* Pure ACK. 
Do nothing */ @@ -1021,12 +1040,11 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, } s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); - offset = vnet_buffer (b)->tcp.seq_number - tc->irs; - clib_warning ("ooo: offset %d len %d", offset, data_len); - - rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, offset, data_len, - vlib_buffer_get_current (b)); + /* Enqueue out-of-order data with absolute offset */ + rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, + vnet_buffer (b)->tcp.seq_number, + data_len, vlib_buffer_get_current (b)); /* Nothing written */ if (rv) @@ -1542,6 +1560,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Notify app that we have connection */ stream_session_connect_notify (&new_tc0->connection, sst, 0); + stream_session_init_fifos_pointers (&new_tc0->connection, + new_tc0->irs + 1, + new_tc0->iss + 1); /* Make sure after data segment processing ACK is sent */ new_tc0->flags |= TCP_CONN_SNDACK; } @@ -1552,7 +1573,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Notify app that we have connection */ stream_session_connect_notify (&new_tc0->connection, sst, 0); - + stream_session_init_fifos_pointers (&new_tc0->connection, + new_tc0->irs + 1, + new_tc0->iss + 1); tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2139,6 +2162,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); + /* Init fifo pointers after we have iss */ + stream_session_init_fifos_pointers (&child0->connection, + child0->irs + 1, + child0->iss + 1); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -2474,6 +2501,7 @@ do { \ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. 
*/ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -2499,6 +2527,7 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 4e1a7aa5..a85d30da 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -359,7 +359,8 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) { opts->flags |= TCP_OPTS_FLAG_SACK; opts->sacks = tc->snd_sacks; - opts->n_sack_blocks = vec_len (tc->snd_sacks); + opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks), + TCP_OPTS_MAX_SACK_BLOCKS); len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; } } @@ -917,6 +918,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + tc->rcv_las = tc->rcv_nxt; /* TODO this is updated in output as well ... 
*/ if (tc->snd_nxt > tc->snd_una_max) diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index ed032206..a457ac8f 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -231,7 +231,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) tcp_update_sack_list (tc, 300, 300); if (verbose) vlib_cli_output (vm, "overlap first 2 segments:\n%U", - format_tcp_sacks, tc->snd_sacks); + format_tcp_sacks, tc); TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", vec_len (tc->snd_sacks), 3); TCP_TEST ((tc->snd_sacks[0].start == 900), @@ -244,7 +244,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) tcp_update_sack_list (tc, 1100, 1200); if (verbose) vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", - format_tcp_sacks, tc->snd_sacks); + format_tcp_sacks, tc); TCP_TEST ((vec_len (tc->snd_sacks) == 4), "sack blocks %d expected %d", vec_len (tc->snd_sacks), 4); TCP_TEST ((tc->snd_sacks[0].start == 1100), @@ -257,7 +257,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) tcp_update_sack_list (tc, 800, 900); if (verbose) vlib_cli_output (vm, "join middle segments [800, 900]\n%U", - format_tcp_sacks, tc->snd_sacks); + format_tcp_sacks, tc); TCP_TEST ((vec_len (tc->snd_sacks) == 3), "sack blocks %d expected %d", vec_len (tc->snd_sacks), 3); @@ -271,8 +271,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) tc->rcv_nxt = 1200; tcp_update_sack_list (tc, 1200, 1200); if (verbose) - vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", - format_tcp_sacks, tc->snd_sacks); + vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", format_tcp_sacks, tc); TCP_TEST ((vec_len (tc->snd_sacks) == 0), "sack blocks %d expected %d", vec_len (tc->snd_sacks), 0); return 0; @@ -502,7 +501,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) { offset = (2 * i + 1) * sizeof (u32); data = (u8 *) (test_data + (2 * i + 1)); - rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + 
if (i == 0) + { + rv = svm_fifo_enqueue_nowait (f, sizeof (u32), data); + rv = rv > 0 ? 0 : rv; + } + else + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", 2 * i + 1, offset, offset + sizeof (u32)); @@ -517,6 +522,26 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) vlib_cli_output (vm, "fifo after odd segs: %U", format_svm_fifo, f, 1); TCP_TEST ((f->tail == 8), "fifo tail %u", f->tail); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + /* + * Try adding a completely overlapped segment + */ + offset = 3 * sizeof (u32); + data = (u8 *) (test_data + 3); + rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + goto err; + } + + if (verbose) + vlib_cli_output (vm, "fifo after overlap seg: %U", format_svm_fifo, f, 1); + + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); /* * Make sure format functions are not buggy @@ -887,7 +912,7 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) f->head = fifo_initial_offset; f->tail = fifo_initial_offset; - for (i = 0; i < vec_len (generate); i++) + for (i = !randomize; i < vec_len (generate); i++) { tp = generate + i; svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset, @@ -895,6 +920,10 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) (u8 *) data_pattern + tp->offset); } + /* Add the first segment in order for non random data */ + if (!randomize) + svm_fifo_enqueue_nowait (f, generate[0].len, (u8 *) data_pattern); + /* * Expected result: one big fat chunk at offset 1 if randomize == 1 */ @@ -964,6 +993,73 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) return 0; } +static int +tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 6 << 10; 
+ u32 fifo_initial_offset = 1000000000; + u32 test_n_bytes = 5000, j; + u8 *test_data = 0, *data_buf = 0; + int i, rv, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + + /* + * Create a fifo and add segments + */ + f = fifo_prepare (fifo_size); + + /* Set head and tail pointers */ + fifo_initial_offset = fifo_initial_offset % fifo_size; + svm_fifo_init_pointers (f, fifo_initial_offset); + + vec_validate (test_data, test_n_bytes - 1); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i; + + for (i = test_n_bytes - 1; i > 0; i--) + { + rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i, + sizeof (u8), &test_data[i]); + if (verbose) + vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8)); + if (rv) + { + clib_warning ("enqueue returned %d", rv); + svm_fifo_free (f); + vec_free (test_data); + return -1; + } + } + + svm_fifo_enqueue_nowait (f, sizeof (u8), &test_data[0]); + + vec_validate (data_buf, vec_len (test_data)); + + svm_fifo_dequeue_nowait (f, vec_len (test_data), data_buf); + rv = compare_data (data_buf, test_data, 0, vec_len (test_data), &j); + if (rv) + vlib_cli_output (vm, "[%d] dequeued %u expected %u", j, data_buf[j], + test_data[j]); + TCP_TEST ((rv == 0), "dequeued compared to original returned %d", rv); + + svm_fifo_free (f); + vec_free (test_data); + return 0; +} + static int tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) { @@ -1028,6 +1124,10 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) { res = tcp_test_fifo1 (vm, input); } + else if (unformat (input, "fifo4")) + { + res = tcp_test_fifo4 (vm, input); + } } return res; -- cgit 1.2.3-korg From 3af90fceb61d0c236709c25df936bbbf304cbff5 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 3 May 2017 
21:09:42 -0700 Subject: Fix TCP loss recovery, VPP-745 Allows pure loss recovery retransmits only on timeout. Change-Id: I563cdbf9e7b890a6569350bdbda4f746ace0544e Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 35 +++++++++++++++++++++++++++-------- src/vnet/tcp/tcp.h | 4 ++-- src/vnet/tcp/tcp_input.c | 35 +++++++++++++++++++++++++---------- src/vnet/tcp/tcp_newreno.c | 2 +- src/vnet/tcp/tcp_output.c | 44 +++++++++++++++++--------------------------- 5 files changed, 72 insertions(+), 48 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index de4edfa6..e80e2ec9 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -567,30 +567,49 @@ tcp_session_send_mss (transport_connection_t * trans_conn) return tc->snd_mss; } +always_inline u32 +tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) +{ + if (tc->snd_wnd < tc->snd_mss) + { + return tc->snd_wnd < snd_space ? tc->snd_wnd : 0; + } + + /* If we can't write at least a segment, don't try at all */ + if (snd_space < tc->snd_mss) + return 0; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); +} + /** * Compute tx window session is allowed to fill. */ u32 tcp_session_send_space (transport_connection_t * trans_conn) { - u32 snd_space, chunk; + int snd_space; tcp_connection_t *tc = (tcp_connection_t *) trans_conn; /* If we haven't gotten dupacks or if we did and have gotten sacked bytes * then we can still send */ - if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0 + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0 && (tc->rcv_dupacks == 0 || tc->sack_sb.last_sacked_bytes))) { - chunk = tc->snd_wnd > tc->snd_mss ? 
tc->snd_mss : tc->snd_wnd; snd_space = tcp_available_snd_space (tc); + return tcp_round_snd_space (tc, snd_space); + } - /* If we can't write at least a segment, don't try at all */ - if (chunk == 0 || snd_space < chunk) + if (tcp_in_recovery (tc)) + { + tc->snd_nxt = tc->snd_una_max; + snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + - (tc->snd_una_max - tc->snd_congestion); + if (snd_space <= 0) return 0; - - /* round down to mss multiple */ - return snd_space - (snd_space % chunk); + return tcp_round_snd_space (tc, snd_space); } /* If in fast recovery, send 1 SMSS if wnd allows */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index f61a1b52..c75479dc 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -24,8 +24,8 @@ #include #include -#define TCP_TICK 10e-3 /**< TCP tick period (s) */ -#define THZ 1/TCP_TICK /**< TCP tick frequency */ +#define TCP_TICK 0.001 /**< TCP tick period (s) */ +#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */ #define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ #define TCP_MAX_OPTION_SPACE 40 diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0030cfe2..e9c52c5e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -392,11 +392,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ - if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; } - /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). 
Note: last condition could be dropped, we don't @@ -406,19 +405,22 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) mrtt = tcp_time_now () - tc->opt.tsecr; } + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; + + /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is + * not valid */ + if (tc->bytes_acked) + tc->rto_boff = 0; + /* Ignore dubious measurements */ if (mrtt == 0 || mrtt > TCP_RTT_MAX) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - /* Allow measuring of RTT and make sure boff is 0 */ - tc->rtt_seq = 0; - tc->rto_boff = 0; - - return 1; + return 0; } /** @@ -735,7 +737,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_cong_recovery (tc)) + if (tcp_in_fastrecovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -749,6 +751,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* Clear retransmitted bytes. XXX should we clear all? */ tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); /* In case snd_nxt is still in the past and output tries to @@ -772,6 +775,13 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; tc->rcv_dupacks = 0; + if (tcp_in_recovery (tc)) + { + tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes); + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + if (seq_geq (tc->snd_una, tc->snd_congestion)) + tcp_recovery_off (tc); + } } } @@ -897,7 +907,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_cc_rcv_ack (tc, b); /* If everything has been acked, stop retransmit timer - * otherwise update */ + * otherwise update. 
*/ if (tc->snd_una == tc->snd_una_max) tcp_retransmit_timer_reset (tc); else @@ -1778,6 +1788,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_send_reset (b0, is_ip4); goto drop; } + + /* Update rtt and rto */ + tc0->bytes_acked = 1; + tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); + /* Switch state to ESTABLISHED */ tc0->state = TCP_STATE_ESTABLISHED; diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 856dffe4..3525f4e5 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -38,7 +38,7 @@ newreno_rcv_ack (tcp_connection_t * tc) else { /* Round up to 1 if needed */ - tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1); } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a85d30da..7ee930c6 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -73,7 +73,7 @@ tcp_set_snd_mss (tcp_connection_t * tc) snd_mss = dummy_mtu; /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; + snd_mss = clib_min (tc->opt.mss, snd_mss); tc->snd_mss = snd_mss; @@ -923,6 +923,12 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, /* TODO this is updated in output as well ... 
*/ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -1019,9 +1025,10 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) } /* Start again from the beginning */ - tcp_recovery_on (tc); + tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } static void @@ -1032,7 +1039,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, snd_space, n_bytes; + u32 bi, n_bytes; if (is_syn) { @@ -1065,33 +1072,16 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (snd_space == 0) - { - clib_warning ("no wnd to retransmit"); - tcp_return_buffer (tm); - - /* Force one segment */ - tcp_retransmit_first_unacked (tc); + /* Send one segment. No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + scoreboard_clear (&tc->sack_sb); - /* Re-enable retransmit timer. Output may be unwilling - * to do it for us */ - tcp_retransmit_timer_set (tc); - - return; - } - else + if (n_bytes == 0) { - /* No fancy recovery for now! 
*/ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); - scoreboard_clear (&tc->sack_sb); - - if (n_bytes == 0) - return; + clib_warning ("could not retransmit"); + return; } } else @@ -1400,7 +1390,7 @@ tcp46_output_inline (vlib_main_t * vm, } /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { -- cgit 1.2.3-korg From c8343412ee0deaa8e9192cbc4846aae1e9f48a4d Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 4 May 2017 14:25:50 -0700 Subject: Include TCP options in segment size computation Ensure that TCP data plus options does not exceed peer's advertised MSS. Change-Id: I0de824cb3619346f0394dd694942fc1cf33a82b7 Signed-off-by: Florin Coras --- src/vnet/session/node.c | 2 +- src/vnet/tcp/tcp.c | 17 +++++++- src/vnet/tcp/tcp.h | 10 +++-- src/vnet/tcp/tcp_input.c | 6 ++- src/vnet/tcp/tcp_output.c | 104 ++++++++++++++++++++++++++++++---------------- 5 files changed, 97 insertions(+), 42 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 210754fa..2d12ee2b 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -95,8 +95,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = transport_vft->get_connection (s0->connection_index, thread_index); /* Make sure we have space to send and there's something to dequeue */ - snd_space0 = transport_vft->send_space (tc0); snd_mss0 = transport_vft->send_mss (tc0); + snd_space0 = transport_vft->send_space (tc0); /* Can't make any progress */ if (snd_space0 == 0 || snd_mss0 == 0) diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index e80e2ec9..224ee0dd 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -331,7 +331,7 @@ void tcp_connection_init_vars (tcp_connection_t * tc) { tcp_connection_timers_init (tc); - 
tcp_set_snd_mss (tc); + tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); } @@ -560,10 +560,23 @@ tcp_half_open_session_get_transport (u32 conn_index) return &tc->connection; } +/** + * Compute maximum segment size for session layer. + * + * Since the result needs to be the actual data length, it first computes + * the tcp options to be used in the next burst and subtracts their + * length from the connection's snd_mss. + */ u16 tcp_session_send_mss (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + + /* Ensure snd_mss does accurately reflect the amount of data we can push + * in a segment. This also makes sure that options are updated according to + * the current state of the connection. */ + tcp_update_snd_mss (tc); + return tc->snd_mss; } @@ -607,7 +620,7 @@ tcp_session_send_space (transport_connection_t * trans_conn) tc->snd_nxt = tc->snd_una_max; snd_space = tcp_available_wnd (tc) - tc->rtx_bytes - (tc->snd_una_max - tc->snd_congestion); - if (snd_space <= 0) + if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; return tcp_round_snd_space (tc, snd_space); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index c75479dc..8212ada7 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -211,7 +211,9 @@ typedef struct _tcp_connection u32 irs; /**< initial remote sequence */ /* Options */ - tcp_options_t opt; /**< TCP connection options parsed */ + tcp_options_t opt; /**< TCP connection options parsed */ + tcp_options_t snd_opts; /**< Tx options for connection */ + u8 snd_opts_len; /**< Tx options len */ u8 rcv_wscale; /**< Window scale to advertise to peer */ u8 snd_wscale; /**< Window scale to use when sending */ u32 tsval_recent; /**< Last timestamp received */ @@ -241,7 +243,8 @@ typedef struct _tcp_connection u32 rtt_ts; /**< Timestamp for tracked ACK */ u32 rtt_seq; /**< Sequence number for tracked ACK */ - u16 snd_mss; /**< Send MSS */ + u16 snd_mss; /**< 
Effective send max seg (data) size */ + u16 mss; /**< Our max seg size that includes options */ } tcp_connection_t; struct _tcp_cc_algorithm @@ -405,7 +408,8 @@ void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4); void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); -void tcp_set_snd_mss (tcp_connection_t * tc); +void tcp_init_mss (tcp_connection_t * tc); +void tcp_update_snd_mss (tcp_connection_t * tc); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index e9c52c5e..d268251c 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -389,10 +389,14 @@ static int tcp_update_rtt (tcp_connection_t * tc, u32 ack) { u32 mrtt = 0; + u8 rtx_acked; + + /* Determine if only rtx bytes are acked. TODO fast retransmit */ + rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. 
*/ - if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff) + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !rtx_acked) { mrtt = tcp_time_now () - tc->rtt_ts; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 7ee930c6..2a1b1407 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -64,26 +64,6 @@ format_tcp_tx_trace (u8 * s, va_list * args) return s; } -void -tcp_set_snd_mss (tcp_connection_t * tc) -{ - u16 snd_mss; - - /* TODO find our iface MTU */ - snd_mss = dummy_mtu; - - /* TODO cache mss and consider PMTU discovery */ - snd_mss = clib_min (tc->opt.mss, snd_mss); - - tc->snd_mss = snd_mss; - - if (tc->snd_mss == 0) - { - clib_warning ("snd mss is 0"); - tc->snd_mss = dummy_mtu; - } -} - static u8 tcp_window_compute_scale (u32 available_space) { @@ -100,7 +80,7 @@ tcp_window_compute_scale (u32 available_space) always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * dummy_mtu; + return TCP_IW_N_SEGMENTS * tc->mss; } /** @@ -310,7 +290,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) u8 len = 0; opts->flags |= TCP_OPTS_FLAG_MSS; - opts->mss = dummy_mtu; /*XXX discover that */ + opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; if (tcp_opts_wscale (&tc->opt)) @@ -389,6 +369,57 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } +/** + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * Update snd_mss to reflect the effective segment size that we can send + * by taking into account all TCP options, including SACKs + */ +void +tcp_update_snd_mss (tcp_connection_t * tc) +{ + /* Compute options to be used for connection. 
These may be reused when + * sending data or to compute the effective mss (snd_mss) */ + tc->snd_opts_len = + tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); + + /* XXX check if MTU has been updated */ + tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; +} + +void +tcp_init_mss (tcp_connection_t * tc) +{ + tcp_update_rcv_mss (tc); + + /* TODO cache mss and consider PMTU discovery */ + tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + + if (tc->snd_mss == 0) + { + clib_warning ("snd mss is 0"); + tc->snd_mss = tc->mss; + } + + /* We should have enough space for 40 bytes of options */ + ASSERT (tc->snd_mss > 45); + + /* If we use timestamp option, account for it */ + if (tcp_opts_tstamp (&tc->opt)) + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; +} + #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ @@ -886,20 +917,20 @@ tcp_make_state_flags (tcp_state_t next_state) */ static void tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, - tcp_state_t next_state) + tcp_state_t next_state, u8 compute_opts) { u32 advertise_wnd, data_len; - u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; - tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_hdr_opts_len, opts_write_len, flags; tcp_header_t *th; data_len = b->current_length; vnet_buffer (b)->tcp.flags = 0; - /* Make and write options */ - memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + if (compute_opts) + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + + /* Write pre-computed options */ + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); @@ -910,9 +941,9 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write 
((u8 *) (th + 1), snd_opts); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); - ASSERT (opts_write_len == tcp_opts_len); + ASSERT (opts_write_len == tc->snd_opts_len); /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; @@ -993,6 +1024,8 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, goto done; } + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + ASSERT (max_bytes <= tc->snd_mss); n_bytes = stream_session_peek_bytes (&tc->connection, @@ -1000,7 +1033,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, max_bytes); ASSERT (n_bytes != 0); b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 0); tc->rtx_bytes += n_bytes; done: @@ -1097,7 +1130,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; @@ -1168,6 +1201,7 @@ tcp_timer_persist_handler (u32 index) /* Try to force the first unsent segment */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, @@ -1180,7 +1214,7 @@ tcp_timer_persist_handler (u32 index) } b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 0); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable persist timer */ @@ -1507,7 +1541,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tcp_connection_t *tc; tc = (tcp_connection_t *) tconn; - tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); return 0; } -- cgit 1.2.3-korg From f6d68ed2db2bcd41c9b7ddde5e411073c1566c29 
Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Sun, 7 May 2017 19:12:02 -0700 Subject: Add support for tcp/session buffer chains Change-Id: I01c6e3dc3a1b2785df37bb66b19c4b5cbb8f3211 Signed-off-by: Florin Coras --- src/scripts/vnet/uri/dummy_app.py | 19 ++++- src/uri/uri_socket_server.c | 47 +++++++++-- src/vnet/session/node.c | 172 ++++++++++++++++++++++++++------------ src/vnet/session/session.c | 89 +++++++++++++++++--- src/vnet/session/session.h | 4 +- src/vnet/tcp/tcp_input.c | 16 ++-- src/vnet/tcp/tcp_output.c | 4 +- 7 files changed, 266 insertions(+), 85 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/scripts/vnet/uri/dummy_app.py b/src/scripts/vnet/uri/dummy_app.py index 50333923..ff00f2fc 100644 --- a/src/scripts/vnet/uri/dummy_app.py +++ b/src/scripts/vnet/uri/dummy_app.py @@ -6,14 +6,28 @@ import time # action can be reflect or drop action = "drop" +test = 0 + +def test_data (data, n_rcvd): + n_read = len (data); + for i in range(n_read): + expected = (n_rcvd + i) & 0xff + byte_got = ord (data[i]) + if (byte_got != expected): + print("Difference at byte {}. 
Expected {} got {}" + .format(n_rcvd + i, expected, byte_got)) + return n_read def handle_connection (connection, client_address): print("Received connection from {}".format(repr(client_address))) + n_rcvd = 0 try: while True: data = connection.recv(4096) if not data: break; + if (test == 1): + n_rcvd += test_data (data, n_rcvd) if (action != "drop"): connection.sendall(data) finally: @@ -78,8 +92,9 @@ def run(mode, ip, port): if __name__ == "__main__": if (len(sys.argv)) < 4: - raise Exception("Usage: ./dummy_app []") - if (len(sys.argv) == 5): + raise Exception("Usage: ./dummy_app [ ]") + if (len(sys.argv) == 6): action = sys.argv[4] + test = int(sys.argv[5]) run (sys.argv[1], sys.argv[2], int(sys.argv[3])) diff --git a/src/uri/uri_socket_server.c b/src/uri/uri_socket_server.c index 64d3b492..2366f420 100644 --- a/src/uri/uri_socket_server.c +++ b/src/uri/uri_socket_server.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -72,32 +73,59 @@ setup_signal_handler (void) int main (int argc, char *argv[]) { - int sockfd, portno, n, sent, accfd; + int sockfd, portno, n, sent, accfd, reuse; + socklen_t client_addr_len; struct sockaddr_in serv_addr; + struct sockaddr_in client; struct hostent *server; u8 *rx_buffer = 0; - if (0 && argc < 3) + if (argc > 1 && argc < 3) { - fformat (stderr, "usage %s hostname port\n", argv[0]); + fformat (stderr, "usage %s host port\n", argv[0]); exit (0); } + if (argc >= 3) + { + portno = atoi (argv[2]); + server = gethostbyname (argv[1]); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + } + else + { + /* Defaults */ + portno = 1234; + server = gethostbyname ("6.0.1.1"); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + } + + setup_signal_handler (); - portno = 1234; // atoi(argv[2]); sockfd = socket (AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { clib_unix_error ("socket"); exit (1); } - server = gethostbyname ("6.0.1.1"); - if (server == 
NULL) + + reuse = 1; + if (setsockopt (sockfd, SOL_SOCKET, SO_REUSEADDR, (const char *) &reuse, + sizeof (reuse)) < 0) { - clib_unix_warning ("gethostbyname"); + clib_unix_error ("setsockopt(SO_REUSEADDR) failed"); exit (1); } + bzero ((char *) &serv_addr, sizeof (serv_addr)); serv_addr.sin_family = AF_INET; bcopy ((char *) server->h_addr, @@ -123,12 +151,15 @@ main (int argc, char *argv[]) if (signal_received) break; - accfd = accept (sockfd, 0 /* don't care */ , 0); + client_addr_len = sizeof (struct sockaddr); + accfd = accept (sockfd, (struct sockaddr *) &client, &client_addr_len); if (accfd < 0) { clib_unix_warning ("accept"); continue; } + fformat (stderr, "Accepted connection from: %s : %d\n", + inet_ntoa (client.sin_addr), client.sin_port); while (1) { n = recv (accfd, rx_buffer, vec_len (rx_buffer), 0 /* flags */ ); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 2d12ee2b..ce7c3868 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -70,6 +70,58 @@ static u32 session_type_to_next[] = { SESSION_QUEUE_NEXT_IP6_LOOKUP, }; +always_inline void +session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, + u8 thread_index, svm_fifo_t * fifo, + vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, + u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, + u16 deq_per_buf, u8 peek_data) +{ + vlib_buffer_t *chain_b0, *prev_b0; + u32 chain_bi0; + u16 len_to_deq0, n_bytes_read; + u8 *data0, j; + + chain_bi0 = bi0; + chain_b0 = b0; + for (j = 1; j < n_bufs_per_seg; j++) + { + prev_b0 = chain_b0; + len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); + + *n_bufs -= 1; + chain_bi0 = smm->tx_buffers[thread_index][*n_bufs]; + _vec_len (smm->tx_buffers[thread_index]) = *n_bufs; + + chain_b0 = vlib_get_buffer (vm, chain_bi0); + chain_b0->current_data = 0; + data0 = vlib_buffer_get_current (chain_b0); + if (peek_data) + { + n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0); + *rx_offset += n_bytes_read; + } + 
else + { + n_bytes_read = svm_fifo_dequeue_nowait (fifo, len_to_deq0, data0); + } + ASSERT (n_bytes_read == len_to_deq0); + chain_b0->current_length = n_bytes_read; + b0->total_length_not_including_first_buffer += chain_b0->current_length; + + /* update previous buffer */ + prev_b0->next_buffer = chain_bi0; + prev_b0->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + chain_b0->next_buffer = 0; + + *left_to_snd0 -= n_bytes_read; + if (*left_to_snd0 == 0) + break; + } +} + always_inline int session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, session_manager_main_t * smm, @@ -78,16 +130,17 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, int *n_tx_packets, u8 peek_data) { u32 n_trace = vlib_get_trace_count (vm, node); - u32 left_to_snd0, max_len_to_snd0, len_to_deq0, n_bufs, snd_space0; - u32 n_frame_bytes, n_frames_per_evt; + u32 left_to_snd0, max_len_to_snd0, len_to_deq0, snd_space0; + u32 n_bufs_per_evt, n_frames_per_evt; transport_connection_t *tc0; transport_proto_vft_t *transport_vft; u32 next_index, next0, *to_next, n_left_to_next, bi0; vlib_buffer_t *b0; - u32 rx_offset = 0, max_dequeue0; - u16 snd_mss0; + u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg; + u16 snd_mss0, n_bufs_per_seg, n_bufs; u8 *data0; int i, n_bytes_read; + u32 n_bytes_per_buf, deq_per_buf; next_index = next0 = session_type_to_next[s0->session_type]; @@ -134,8 +187,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, max_len_to_snd0 = snd_space0; } - n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE; - n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes); + n_bytes_per_buf = vlib_buffer_free_list_buffer_size (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; + n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); + n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) + * n_bufs_per_seg; + n_frames_per_evt = 
ceil ((double) n_bufs_per_evt / VLIB_FRAME_SIZE); + + deq_per_buf = clib_min (snd_mss0, n_bytes_per_buf); n_bufs = vec_len (smm->tx_buffers[thread_index]); left_to_snd0 = max_len_to_snd0; @@ -146,9 +206,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { vec_validate (smm->tx_buffers[thread_index], n_bufs + VLIB_FRAME_SIZE - 1); - n_bufs += - vlib_buffer_alloc (vm, &smm->tx_buffers[thread_index][n_bufs], - VLIB_FRAME_SIZE); + n_bufs += vlib_buffer_alloc (vm, + &smm->tx_buffers[thread_index][n_bufs], + VLIB_FRAME_SIZE); /* buffer shortage * XXX 0.9 because when debugging we might not get a full frame */ @@ -165,11 +225,14 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, } vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (left_to_snd0 && n_left_to_next) + while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) { + /* + * Handle first buffer in chain separately + */ + /* Get free buffer */ - n_bufs--; - bi0 = smm->tx_buffers[thread_index][n_bufs]; + bi0 = smm->tx_buffers[thread_index][--n_bufs]; _vec_len (smm->tx_buffers[thread_index]) = n_bufs; b0 = vlib_get_buffer (vm, bi0); @@ -177,52 +240,19 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID | VNET_BUFFER_LOCALLY_ORIGINATED; b0->current_data = 0; + b0->total_length_not_including_first_buffer = 0; /* RX on the local interface. 
tx in default fib */ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - /* usual speculation, or the enqueue_x1 macro will barf */ - to_next[0] = bi0; - to_next += 1; - n_left_to_next -= 1; - - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - if (PREDICT_FALSE (n_trace > 0)) - { - session_queue_trace_t *t0; - vlib_trace_buffer (vm, node, next_index, b0, - 1 /* follow_chain */ ); - vlib_set_trace_count (vm, node, --n_trace); - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - t0->session_index = s0->session_index; - t0->server_thread_index = s0->thread_index; - } + len_to_deq0 = clib_min (left_to_snd0, deq_per_buf); - len_to_deq0 = (left_to_snd0 < snd_mss0) ? left_to_snd0 : snd_mss0; - - /* *INDENT-OFF* */ - SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ - ed->data[0] = e0->event_id; - ed->data[1] = max_dequeue0; - ed->data[2] = len_to_deq0; - ed->data[3] = left_to_snd0; - })); - /* *INDENT-ON* */ - - /* Make room for headers */ data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); - - /* Dequeue the data - * TODO 1) peek instead of dequeue - * 2) buffer chains */ if (peek_data) { n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, len_to_deq0, data0); - if (n_bytes_read <= 0) - goto dequeue_fail; - /* Keep track of progress locally, transport is also supposed to * increment it independently when pushing the header */ rx_offset += n_bytes_read; @@ -231,18 +261,56 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, len_to_deq0, data0); - if (n_bytes_read <= 0) - goto dequeue_fail; } - b0->current_length = n_bytes_read; + if (n_bytes_read <= 0) + goto dequeue_fail; - /* Ask transport to push header */ - transport_vft->push_header (tc0, b0); + b0->current_length = n_bytes_read; left_to_snd0 -= n_bytes_read; *n_tx_packets = *n_tx_packets + 1; + /* + * Fill in the remaining buffers in the chain, if any + */ + if (PREDICT_FALSE 
(n_bufs_per_seg > 1)) + session_tx_fifo_chain_tail (smm, vm, thread_index, + s0->server_tx_fifo, b0, bi0, + n_bufs_per_seg, &left_to_snd0, + &n_bufs, &rx_offset, deq_per_buf, + peek_data); + + /* Ask transport to push header after current_length and + * total_length_not_including_first_buffer are updated */ + transport_vft->push_header (tc0, b0); + + /* *INDENT-OFF* */ + SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ + ed->data[0] = e0->event_id; + ed->data[1] = max_dequeue0; + ed->data[2] = len_to_deq0; + ed->data[3] = left_to_snd0; + })); + /* *INDENT-ON* */ + + /* usual speculation, or the enqueue_x1 macro will barf */ + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (PREDICT_FALSE (n_trace > 0)) + { + session_queue_trace_t *t0; + vlib_trace_buffer (vm, node, next_index, b0, + 1 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + t0->session_index = s0->session_index; + t0->server_thread_index = s0->thread_index; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index e92bb440..6e129dde 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -432,33 +432,97 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, return 0; } +/** Enqueue buffer chain tail */ +always_inline int +session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, + u32 offset, u8 is_in_order) +{ + vlib_buffer_t *chain_b; + u32 chain_bi = b->next_buffer; + vlib_main_t *vm = vlib_get_main (); + u8 *data, len; + u16 written = 0; + int rv = 0; + + do + { + chain_b = vlib_get_buffer (vm, chain_bi); + data = vlib_buffer_get_current (chain_b); + len = chain_b->current_length; + if (is_in_order) + { + rv = svm_fifo_enqueue_nowait (s->server_rx_fifo, len, data); + if (rv < len) + { + return (rv > 0) ? 
(written + rv) : written; + } + written += rv; + } + else + { + rv = svm_fifo_enqueue_with_offset (s->server_rx_fifo, offset, len, + data); + if (rv) + return -1; + offset += len; + } + } + while ((chain_bi = (chain_b->flags & VLIB_BUFFER_NEXT_PRESENT) + ? chain_b->next_buffer : 0)); + + if (is_in_order) + return written; + + return 0; +} + /* * Enqueue data for delivery to session peer. Does not notify peer of enqueue * event but on request can queue notification events for later delivery by * calling stream_server_flush_enqueue_events(). * * @param tc Transport connection which is to be enqueued data - * @param data Data to be enqueued - * @param len Length of data to be enqueued + * @param b Buffer to be enqueued + * @param offset Offset at which to start enqueueing if out-of-order * @param queue_event Flag to indicate if peer is to be notified or if event * is to be queued. The former is useful when more data is * enqueued and only one event is to be generated. + * @param is_in_order Flag to indicate if data is in order * @return Number of bytes enqueued or a negative value if enqueueing failed. */ int -stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, - u8 queue_event) +stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, + u32 offset, u8 queue_event, u8 is_in_order) { stream_session_t *s; - int enqueued; + int enqueued = 0, rv; s = stream_session_get (tc->s_index, tc->thread_index); - /* Make sure there's enough space left. 
We might've filled the pipes */ - if (PREDICT_FALSE (len > svm_fifo_max_enqueue (s->server_rx_fifo))) - return -1; - - enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, len, data); + if (is_in_order) + { + enqueued = + svm_fifo_enqueue_nowait (s->server_rx_fifo, b->current_length, + vlib_buffer_get_current (b)); + if (PREDICT_FALSE + ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && enqueued > 0)) + { + rv = session_enqueue_chain_tail (s, b, 0, 1); + if (rv <= 0) + return enqueued; + enqueued += rv; + } + } + else + { + rv = svm_fifo_enqueue_with_offset (s->server_rx_fifo, offset, + b->current_length, + vlib_buffer_get_current (b)); + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && !rv)) + rv = session_enqueue_chain_tail (s, b, offset + b->current_length, 0); + if (rv) + return -1; + } if (queue_event) { @@ -476,7 +540,10 @@ stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, } } - return enqueued; + if (is_in_order) + return enqueued; + + return 0; } /** Check if we have space in rx fifo to push more bytes */ diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index f41a8a96..f152a2be 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -345,8 +345,8 @@ stream_session_fifo_size (transport_connection_t * tc) } int -stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, - u8 queue_event); +stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, + u32 offset, u8 queue_event, u8 is_in_order); u32 stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index d268251c..ceb00fc3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -993,9 +993,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, return TCP_ERROR_PURE_ACK; } - written = stream_session_enqueue_data (&tc->connection, - vlib_buffer_get_current (b), - 
data_len, 1 /* queue event */ ); + written = stream_session_enqueue_data (&tc->connection, b, 0, + 1 /* queue event */ , 1); TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written); @@ -1053,12 +1052,10 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, return TCP_ERROR_PURE_ACK; } - s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); - /* Enqueue out-of-order data with absolute offset */ - rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, - vnet_buffer (b)->tcp.seq_number, - data_len, vlib_buffer_get_current (b)); + rv = stream_session_enqueue_data (&tc->connection, b, + vnet_buffer (b)->tcp.seq_number, + 0 /* queue event */ , 0); /* Nothing written */ if (rv) @@ -1075,6 +1072,8 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, ooo_segment_t *newest; u32 start, end; + s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); + /* Get the newest segment from the fifo */ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); start = ooo_segment_offset (s0->server_rx_fifo, newest); @@ -2543,6 +2542,7 @@ do { \ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN confirming that the peer (app) has closed */ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 2a1b1407..33e599ec 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -46,7 +46,7 @@ typedef struct tcp_connection_t tcp_connection; } tcp_tx_trace_t; -u16 dummy_mtu = 400; +u16 dummy_mtu = 1460; u8 * format_tcp_tx_trace (u8 * s, va_list * args) @@ -923,7 +923,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, u8 tcp_hdr_opts_len, opts_write_len, flags; tcp_header_t *th; - 
data_len = b->current_length; + data_len = b->current_length + b->total_length_not_including_first_buffer; vnet_buffer (b)->tcp.flags = 0; if (compute_opts) -- cgit 1.2.3-korg From dc629cd95fde7601679354de4a1668070c2ac486 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 9 May 2017 00:52:37 -0700 Subject: Ignore data in RST segments, VPP-723 Change-Id: I01650b3b10b79ec549be0d5eceb0e318c06fcb50 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_input.c | 38 ++++++++++++++---------- src/vnet/tcp/tcp_output.c | 74 +++++++++++++++++++---------------------------- 2 files changed, 53 insertions(+), 59 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ceb00fc3..82e676d4 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2106,16 +2106,16 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Create child session. For syn-flood protection use filter */ - /* 1. first check for an RST */ - if (tcp_rst (th0)) - goto drop; + /* 1. first check for an RST: handled in dispatch */ + /* if (tcp_rst (th0)) + goto drop; */ - /* 2. second check for an ACK */ - if (tcp_ack (th0)) - { - tcp_send_reset (b0, is_ip4); - goto drop; - } + /* 2. second check for an ACK: handled in dispatch */ + /* if (tcp_ack (th0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } */ /* 3. 
check for a SYN (did that already) */ @@ -2401,14 +2401,17 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = tm->dispatch_table[tc0->state][flags0].next; error0 = tm->dispatch_table[tc0->state][flags0].error; - if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) + if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH + || next0 == TCP_INPUT_NEXT_RESET)) { - tcp_state_t state0 = tc0->state; /* Overload tcp flags to store state */ + tcp_state_t state0 = tc0->state; vnet_buffer (b0)->tcp.flags = tc0->state; - clib_warning ("disp error state %U flags %U", - format_tcp_state, &state0, - format_tcp_flags, (int) flags0); + + if (error0 == TCP_ERROR_DISPATCH) + clib_warning ("disp error state %U flags %U", + format_tcp_state, &state0, format_tcp_flags, + (int) flags0); } } else @@ -2517,6 +2520,8 @@ do { \ /* SYNs for new connections -> tcp-listen. */ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. 
*/ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -2534,6 +2539,8 @@ do { \ _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, @@ -2546,7 +2553,8 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); - _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 33e599ec..39891fc3 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -597,54 +597,41 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, int tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, - tcp_state_t state, u32 my_thread_index, u8 is_ip4) + tcp_state_t state, u8 thread_index, u8 is_ip4) { - u8 tcp_hdr_len = sizeof (tcp_header_t); ip4_header_t *ih4; ip6_header_t *ih6; tcp_header_t *th0; - ip4_address_t src_ip40; - ip6_address_t src_ip60; - u16 src_port0; + ip4_address_t src_ip40, dst_ip40; + ip6_address_t src_ip60, dst_ip60; + u16 src_port, dst_port; u32 tmp; + u32 seq, ack; + u8 flags; /* Find IP and TCP headers */ - if (is_ip4) - { - ih4 = vlib_buffer_get_current (b0); - th0 = ip4_next_header (ih4); - } - else - { - ih6 = vlib_buffer_get_current (b0); - th0 = ip6_next_header (ih6); - } + th0 = tcp_buffer_hdr 
(b0); - /* Swap src and dst ip */ + /* Save src and dst ip */ if (is_ip4) { + ih4 = vlib_buffer_get_current (b0); ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); src_ip40.as_u32 = ih4->src_address.as_u32; - ih4->src_address.as_u32 = ih4->dst_address.as_u32; - ih4->dst_address.as_u32 = src_ip40.as_u32; - - /* Chop the end of the pkt */ - b0->current_length += ip4_header_bytes (ih4) + tcp_hdr_len; + dst_ip40.as_u32 = ih4->dst_address.as_u32; } else { + ih6 = vlib_buffer_get_current (b0); ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); - clib_memcpy (&ih6->src_address, &ih6->dst_address, - sizeof (ip6_address_t)); - clib_memcpy (&ih6->dst_address, &src_ip60, sizeof (ip6_address_t)); - - /* Chop the end of the pkt */ - b0->current_length += sizeof (ip6_header_t) + tcp_hdr_len; + clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); } - /* Try to determine what/why we're actually resetting and swap - * src and dst ports */ + src_port = th0->src_port; + dst_port = th0->dst_port; + + /* Try to determine what/why we're actually resetting */ if (state == TCP_STATE_CLOSED) { if (!tcp_syn (th0)) @@ -653,33 +640,32 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, tmp = clib_net_to_host_u32 (th0->seq_number); /* Got a SYN for no listener. 
*/ - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->ack_number = clib_host_to_net_u32 (tmp + 1); - th0->seq_number = 0; - + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + ack = clib_host_to_net_u32 (tmp + 1); + seq = 0; } - else if (state >= TCP_STATE_SYN_SENT) + else { - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->seq_number = th0->ack_number; - th0->ack_number = 0; + flags = TCP_FLAG_RST; + seq = th0->ack_number; + ack = 0; } - src_port0 = th0->src_port; - th0->src_port = th0->dst_port; - th0->dst_port = src_port0; - th0->window = 0; - th0->data_offset_and_reserved = (tcp_hdr_len >> 2) << 4; - th0->urgent_pointer = 0; + tcp_reuse_buffer (vm, b0); + th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack, + sizeof (tcp_header_t), flags, 0); - /* Compute checksum */ if (is_ip4) { + ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40, + IP_PROTOCOL_TCP); th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); } else { int bogus = ~0; + ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60, + IP_PROTOCOL_TCP); th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); ASSERT (!bogus); } -- cgit 1.2.3-korg From db84e579ef77476e3c73780e20243ee1799530f3 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 9 May 2017 18:54:52 -0700 Subject: Improve TCP option handling, VPP-757 Change-Id: Ica634536387d1196366ec96c52770287fcab0768 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 6 +++++- src/vnet/tcp/tcp_input.c | 47 +++++++++++++++++++++++++++++++++++++---------- src/vnet/tcp/tcp_output.c | 16 ++++++++++++---- 3 files changed, 54 insertions(+), 15 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 224ee0dd..a65ab7ff 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -154,6 +154,10 @@ tcp_connection_reset (tcp_connection_t * tc) return; tc->state = TCP_STATE_CLOSED; + + /* Make sure all timers are cleared */ + tcp_connection_timers_reset (tc); + 
stream_session_reset_notify (&tc->connection); } @@ -585,7 +589,7 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) { if (tc->snd_wnd < tc->snd_mss) { - return tc->snd_wnd < snd_space ? tc->snd_wnd : 0; + return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; } /* If we can't write at least a segment, don't try at all */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 82e676d4..318d8ec5 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -112,7 +112,14 @@ tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd)); } -void +/** + * Parse TCP header options. + * + * @param th TCP header + * @param to TCP options data structure to be populated + * @return -1 if parsing failed + */ +int tcp_options_parse (tcp_header_t * th, tcp_options_t * to) { const u8 *data; @@ -134,17 +141,20 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) if (kind == TCP_OPTION_EOL) break; else if (kind == TCP_OPTION_NOOP) - opt_len = 1; + { + opt_len = 1; + continue; + } else { /* broken options */ if (opts_len < 2) - break; + return -1; opt_len = data[1]; /* weird option length */ if (opt_len < 2 || opt_len > opts_len) - break; + return -1; } /* Parse options */ @@ -206,6 +216,7 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) continue; } } + return 0; } /** @@ -261,7 +272,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) return -1; - tcp_options_parse (th0, &tc0->opt); + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt))) + { + return -1; + } if (tcp_segment_check_paws (tc0)) { @@ -1109,19 +1123,24 @@ static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, u16 n_data_bytes, u32 * next0) { - u32 error = 0; + u32 error = 0, n_bytes_to_drop; /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) { /* 
Old sequence numbers allowed through because they overlapped * the rx window */ - if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) { error = TCP_ERROR_SEGMENT_OLD; *next0 = TCP_NEXT_DROP; - goto done; + + /* Chop off the bytes in the past */ + n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; + n_data_bytes -= n_bytes_to_drop; + vlib_buffer_advance (b, n_bytes_to_drop); + + goto in_order; } error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); @@ -1145,6 +1164,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, goto done; } +in_order: + /* In order data, enqueue. Fifo figures out by itself if any out-of-order * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); @@ -1540,7 +1561,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->irs = seq0; /* Parse options */ - tcp_options_parse (tcp0, &new_tc0->opt); + if (tcp_options_parse (tcp0, &new_tc0->opt)) + goto drop; if (tcp_opts_tstamp (&new_tc0->opt)) { @@ -1943,6 +1965,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! 
*/ tc0->state = TCP_STATE_TIME_WAIT; + tcp_connection_timers_reset (tc0); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2149,7 +2172,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - tcp_options_parse (th0, &child0->opt); + if (tcp_options_parse (th0, &child0->opt)) + { + goto drop; + } child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; @@ -2553,6 +2579,7 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 39891fc3..a462d8da 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -396,20 +396,24 @@ tcp_update_snd_mss (tcp_connection_t * tc) /* XXX check if MTU has been updated */ tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; + ASSERT (tc->snd_mss > 0); } void tcp_init_mss (tcp_connection_t * tc) { + u16 default_min_mss = 536; tcp_update_rcv_mss (tc); /* TODO cache mss and consider PMTU discovery */ tc->snd_mss = clib_min (tc->opt.mss, tc->mss); - if (tc->snd_mss == 0) + if (tc->snd_mss < 45) { clib_warning ("snd mss is 0"); - tc->snd_mss = tc->mss; + /* Assume that at least the min default mss works */ + tc->snd_mss = default_min_mss; + tc->opt.mss = default_min_mss; } /* We should have enough space for 40 bytes of options */ @@ -1171,13 +1175,17 @@ tcp_timer_persist_handler (u32 index) vlib_buffer_t *b; u32 bi, n_bytes; - tc = tcp_connection_get (index, thread_index); + tc = tcp_connection_get_if_valid (index, thread_index); + + if (!tc) + 
return; /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ - if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + if (tc->state == TCP_STATE_CLOSED + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) return; /* Increment RTO backoff */ -- cgit 1.2.3-korg From 259cdaeccb70ea1a45b422218d3f5e1920459db8 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Mon, 15 May 2017 16:27:05 -0400 Subject: Fixes to tcp active opens and reception - Properly initialize connection index - Add new session to main hash table - Drop packets that are completely in the past with respect to the rcv wnd (e.g., retransmits due to premature timer pops) Change-Id: Ic0873018ff0a4c59e0913863dc7f0c0f822d5c34 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/vnet/session/session.c | 6 +++++- src/vnet/tcp/tcp_input.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index ed205d76..d0792fae 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -732,6 +732,10 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, /* Cleanup session lookup */ stream_session_half_open_table_del (smm, sst, tc); + + /* Add to established lookup table */ + handle = (((u64) tc->thread_index) << 32) | (u64) new_s->session_index; + stream_session_table_add_for_tc (tc, handle); } void @@ -1104,7 +1108,7 @@ session_manager_main_enable (vlib_main_t * vm) for (i = 0; i < 200000; i++) { stream_session_t *ss; - pool_get (smm->sessions[0], ss); + pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); memset (ss, 0, sizeof (*ss)); } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 318d8ec5..ddee41e0 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1135,6 +1135,10 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t 
* tc, vlib_buffer_t * b, error = TCP_ERROR_SEGMENT_OLD; *next0 = TCP_NEXT_DROP; + /* Completely in the past (possible retransmit) */ + if (seq_lt (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) + goto done; + /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; @@ -1553,6 +1557,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); new_tc0->c_thread_index = my_thread_index; + new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; /* Cleanup half-open connection XXX lock */ pool_put (tm->half_open_connections, tc0); -- cgit 1.2.3-korg From 06d110189e54220c533c5fe0cea7f23e531284b9 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 17 May 2017 14:21:51 -0700 Subject: Improve sack bytes accounting and testing Change-Id: Iabeda0d0615b0f6fe20dd00611cb4c594d90b7eb Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 42 ++++++++++++++++++++++++++ src/vnet/tcp/tcp.h | 1 + src/vnet/tcp/tcp_format.c | 14 --------- src/vnet/tcp/tcp_input.c | 45 ++++++++++++++++------------ src/vnet/tcp/tcp_test.c | 75 +++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 140 insertions(+), 37 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index e365fa0e..36d85e46 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -565,6 +565,48 @@ format_tcp_half_open_session (u8 * s, va_list * args) return format (s, "%U", format_tcp_connection, tc); } +u8 * +format_tcp_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->snd_sacks; + sack_block_t *block; + vec_foreach (block, sacks) + { + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + return s; +} + +u8 * +format_tcp_sack_hole (u8 * s, va_list * args) +{ + sack_scoreboard_hole_t *hole = va_arg (*args, 
sack_scoreboard_hole_t *); + s = format (s, "[%u, %u]", hole->start, hole->end); + return s; +} + +u8 * +format_tcp_scoreboard (u8 * s, va_list * args) +{ + sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); + sack_scoreboard_hole_t *hole; + s = format (s, "head %u tail %u snd_una_adv %u\n", sb->head, sb->tail, + sb->snd_una_adv); + s = format (s, "sacked_bytes %u last_sacked_bytes %u", sb->sacked_bytes, + sb->last_sacked_bytes); + s = format (s, " max_byte_sacked %u\n", sb->max_byte_sacked); + s = format (s, "holes:\n"); + hole = scoreboard_first_hole (sb); + while (hole) + { + s = format (s, "%U", format_tcp_sack_hole, hole); + hole = scoreboard_next_hole (sb, hole); + } + return s; +} + transport_connection_t * tcp_session_get_transport (u32 conn_index, u32 thread_index) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 8212ada7..8d24a70b 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -389,6 +389,7 @@ void tcp_connection_reset (tcp_connection_t * tc); u8 *format_tcp_connection (u8 * s, va_list * args); u8 *format_tcp_connection_verbose (u8 * s, va_list * args); +u8 *format_tcp_scoreboard (u8 * s, va_list * args); always_inline tcp_connection_t * tcp_listener_get (u32 tli) diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c index 4de99235..1ca2f58e 100644 --- a/src/vnet/tcp/tcp_format.c +++ b/src/vnet/tcp/tcp_format.c @@ -128,20 +128,6 @@ format_tcp_header (u8 * s, va_list * args) return s; } -u8 * -format_tcp_sacks (u8 * s, va_list * args) -{ - tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - sack_block_t *sacks = tc->snd_sacks; - sack_block_t *block; - vec_foreach (block, sacks) - { - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } - return s; -} - /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ddee41e0..9d3f4cc3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ 
-533,12 +533,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; - u32 blk_index = 0, old_sacked_bytes, hole_index; + u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index; int i, j; sb->last_sacked_bytes = 0; sb->snd_una_adv = 0; old_sacked_bytes = sb->sacked_bytes; + delivered_bytes = 0; if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; @@ -584,6 +585,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, tc->snd_una, tc->snd_una_max); sb->tail = scoreboard_hole_index (sb, last_hole); + tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + sb->max_byte_sacked = tmp.end; } else { @@ -614,37 +617,43 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { /* Bytes lost because snd_wnd left edge advances */ if (next_hole && seq_leq (next_hole->start, ack)) - sb->sacked_bytes -= next_hole->start - hole->end; + delivered_bytes += next_hole->start - hole->end; else - sb->sacked_bytes -= ack - hole->end; + delivered_bytes += ack - hole->end; } else { sb->sacked_bytes += scoreboard_hole_bytes (hole); } - /* snd_una needs to be advanced */ - if (seq_geq (ack, hole->end)) - { - if (next_hole && seq_lt (ack, next_hole->start)) - sb->snd_una_adv = next_hole->start - ack; - else - sb->snd_una_adv = sb->max_byte_sacked - ack; - - /* all these can be delivered */ - sb->sacked_bytes -= sb->snd_una_adv; - } - /* About to remove last hole */ if (hole == last_hole) { sb->tail = hole->prev; last_hole = scoreboard_last_hole (sb); - /* keep track of max byte sacked in case the last hole + /* keep track of max byte sacked for when the last hole * is acked */ if (seq_gt (hole->end, sb->max_byte_sacked)) sb->max_byte_sacked = hole->end; } + + /* snd_una needs to be advanced */ + if (blk->end == ack && seq_geq (ack, hole->end)) + { + if (next_hole && seq_lt (ack, 
next_hole->start)) + { + sb->snd_una_adv = next_hole->start - ack; + + /* all these can be delivered */ + delivered_bytes += sb->snd_una_adv; + } + else if (!next_hole) + { + sb->snd_una_adv = sb->max_byte_sacked - ack; + delivered_bytes += sb->snd_una_adv; + } + } + scoreboard_remove_hole (sb, hole); hole = next_hole; } @@ -693,8 +702,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } } - sb->last_sacked_bytes = sb->sacked_bytes + sb->snd_una_adv - - old_sacked_bytes; + sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; + sb->sacked_bytes -= delivered_bytes; } /** Update snd_wnd diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index a457ac8f..2af38484 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -35,13 +35,19 @@ } static int -tcp_test_sack_rx () +tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) { tcp_connection_t _tc, *tc = &_tc; sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *sacks = 0, block; sack_scoreboard_hole_t *hole; - int i; + int i, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + } memset (tc, 0, sizeof (*tc)); @@ -69,6 +75,10 @@ tcp_test_sack_rx () tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); tcp_rcv_sacks (tc, 0); + if (verbose) + vlib_cli_output (vm, "sb after even blocks:\n%U", format_tcp_scoreboard, + sb); + TCP_TEST ((pool_elts (sb->holes) == 5), "scoreboard has %d elements", pool_elts (sb->holes)); @@ -83,7 +93,8 @@ tcp_test_sack_rx () TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->last_sacked_bytes == 400), "last sacked bytes %d", sb->last_sacked_bytes); - + TCP_TEST ((sb->max_byte_sacked == 900), + "max byte sacked %u", sb->max_byte_sacked); /* * Inject odd blocks */ @@ -96,6 +107,10 @@ tcp_test_sack_rx () tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); tcp_rcv_sacks (tc, 0); + if (verbose) + vlib_cli_output (vm, "sb after odd blocks:\n%U", 
format_tcp_scoreboard, + sb); + hole = scoreboard_first_hole (sb); TCP_TEST ((pool_elts (sb->holes) == 1), "scoreboard has %d holes", pool_elts (sb->holes)); @@ -112,6 +127,9 @@ tcp_test_sack_rx () * Ack until byte 100, all bytes are now acked + sacked */ tcp_rcv_sacks (tc, 100); + if (verbose) + vlib_cli_output (vm, "ack until byte 100:\n%U", format_tcp_scoreboard, + sb); TCP_TEST ((pool_elts (sb->holes) == 0), "scoreboard has %d elements", pool_elts (sb->holes)); @@ -133,11 +151,17 @@ tcp_test_sack_rx () block.end = 1300; vec_add1 (tc->opt.sacks, block); + if (verbose) + vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); tc->snd_una_max = 1500; tc->snd_una = 1000; tc->snd_nxt = 1500; tcp_rcv_sacks (tc, 1000); + if (verbose) + vlib_cli_output (vm, "sb snd_una_max 1500, snd_una 1000:\n%U", + format_tcp_scoreboard, sb); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv after ack %u", sb->snd_una_adv); TCP_TEST ((pool_elts (sb->holes) == 2), @@ -145,6 +169,10 @@ tcp_test_sack_rx () hole = scoreboard_first_hole (sb); TCP_TEST ((hole->start == 1000 && hole->end == 1200), "first hole start %u end %u", hole->start, hole->end); + TCP_TEST ((sb->snd_una_adv == 0), + "snd_una_adv after ack %u", sb->snd_una_adv); + TCP_TEST ((sb->max_byte_sacked == 1300), + "max sacked byte %u", sb->max_byte_sacked); hole = scoreboard_last_hole (sb); TCP_TEST ((hole->start == 1300 && hole->end == 1500), "last hole start %u end %u", hole->start, hole->end); @@ -157,6 +185,10 @@ tcp_test_sack_rx () vec_reset_length (tc->opt.sacks); tcp_rcv_sacks (tc, 1200); + if (verbose) + vlib_cli_output (vm, "sb ack up to byte 1200:\n%U", format_tcp_scoreboard, + sb); + TCP_TEST ((sb->snd_una_adv == 100), "snd_una_adv after ack %u", sb->snd_una_adv); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); @@ -168,8 +200,41 @@ tcp_test_sack_rx () */ scoreboard_clear (sb); + if (verbose) + vlib_cli_output (vm, "sb cleared all:\n%U", format_tcp_scoreboard, sb); + TCP_TEST 
((pool_elts (sb->holes) == 0), "number of holes %d", pool_elts (sb->holes)); + /* + * Re-inject odd blocks and ack them all + */ + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + for (i = 0; i < 5; i++) + { + vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + } + tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tcp_rcv_sacks (tc, 0); + if (verbose) + vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", + format_tcp_scoreboard, sb); + + tcp_rcv_sacks (tc, 950); + + if (verbose) + vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", + format_tcp_scoreboard, sb); + + TCP_TEST ((pool_elts (sb->holes) == 0), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 50), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + return 0; } @@ -290,7 +355,7 @@ tcp_test_sack (vlib_main_t * vm, unformat_input_t * input) return -1; } - if (tcp_test_sack_rx ()) + if (tcp_test_sack_rx (vm, input)) { return -1; } @@ -303,7 +368,7 @@ tcp_test_sack (vlib_main_t * vm, unformat_input_t * input) } else if (unformat (input, "rx")) { - res = tcp_test_sack_rx (); + res = tcp_test_sack_rx (vm, input); } } -- cgit 1.2.3-korg From bb292f4d3fbecfc6b1bac695f833b0da78369116 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 19 May 2017 09:49:19 -0700 Subject: Improve session debugging Also improves builtin client code. 
Change-Id: I8bca1aa632028f95c373726efb0abf2ee0eff414 Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 9 +++-- src/vnet/session/session_cli.c | 47 +++++++++++++-------- src/vnet/tcp/builtin_client.c | 73 ++++++++++++++++----------------- src/vnet/tcp/builtin_client.h | 3 -- src/vnet/tcp/tcp.c | 92 +++++++++++++++++++++++++++++++++++------- src/vnet/tcp/tcp.h | 4 +- src/vnet/tcp/tcp_input.c | 25 +++++++----- src/vnet/tcp/tcp_output.c | 12 +++++- 8 files changed, 176 insertions(+), 89 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 9b09d0c2..f13f6fea 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -54,19 +54,20 @@ format_svm_fifo (u8 * s, va_list * args) s = format (s, "cursize %u nitems %u has_event %d\n", f->cursize, f->nitems, f->has_event); - s = format (s, "head %d tail %d\n", f->head, f->tail); + s = format (s, " head %d tail %d\n", f->head, f->tail); if (verbose > 1) s = format - (s, "server session %d thread %d client session %d thread %d\n", + (s, " server session %d thread %d client session %d thread %d\n", f->master_session_index, f->master_thread_index, f->client_session_index, f->client_thread_index); if (verbose) { - s = format (s, "ooo pool %d active elts\n", + s = format (s, " ooo pool %d active elts\n", pool_elts (f->ooo_segments)); - s = format (s, "%U", format_ooo_list, f); + if (svm_fifo_has_ooo_data (f)) + s = format (s, " %U", format_ooo_list, f); } return s; } diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index f5990c81..509eedbb 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -30,33 +30,35 @@ format_stream_session (u8 * s, va_list * args) int verbose = va_arg (*args, int); transport_proto_vft_t *tp_vft; u8 *str = 0; - tp_vft = session_get_transport_vft (ss->session_type); - if (verbose) - str = format (0, "%-20llp%-20llp%-15lld", ss->server_rx_fifo, - ss->server_tx_fifo, stream_session_get_index 
(ss)); + if (verbose == 1) + str = format (0, "%-10u%-10u%-10lld", + svm_fifo_max_dequeue (ss->server_rx_fifo), + svm_fifo_max_enqueue (ss->server_tx_fifo), + stream_session_get_index (ss)); if (ss->session_state == SESSION_STATE_READY) { - s = format (s, "%-40U%v", tp_vft->format_connection, - ss->connection_index, ss->thread_index, str); + s = format (s, "%U", tp_vft->format_connection, ss->connection_index, + ss->thread_index, verbose); + if (verbose == 1) + s = format (s, "%v", str); } else if (ss->session_state == SESSION_STATE_LISTENING) { s = format (s, "%-40U%v", tp_vft->format_listener, ss->connection_index, str); } - else if (ss->session_state == SESSION_STATE_READY) + else if (ss->session_state == SESSION_STATE_CONNECTING) { - s = - format (s, "%-40U%v", tp_vft->format_half_open, ss->connection_index, - str); + s = format (s, "%-40U%v", tp_vft->format_half_open, + ss->connection_index, str); } else if (ss->session_state == SESSION_STATE_CLOSED) { s = format (s, "[CL] %-40U%v", tp_vft->format_connection, - ss->connection_index, ss->thread_index, str); + ss->connection_index, ss->thread_index, verbose, str); } else { @@ -85,7 +87,9 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - if (unformat (input, "verbose")) + if (unformat (input, "verbose %d", &verbose)) + ; + else if (unformat (input, "verbose")) verbose = 1; else break; @@ -105,11 +109,11 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, i, pool_elts (pool)); if (verbose) { - if (once_per_pool) + if (once_per_pool && verbose == 1) { - str = format (str, "%-50s%-20s%-20s%-15s", - "Connection", "Rx fifo", "Tx fifo", - "Session Index"); + str = + format (str, "%-50s%-15s%-10s%-10s%-10s", "Connection", + "State", "Rx-f", "Tx-f", "S-idx"); vlib_cli_output (vm, "%v", str); vec_reset_length (str); once_per_pool = 0; @@ -118,7 +122,16 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * 
input, /* *INDENT-OFF* */ pool_foreach (s, pool, ({ - vlib_cli_output (vm, "%U", format_stream_session, s, verbose); + vec_reset_length (str); + str = format (str, "%U", format_stream_session, s, verbose); + if (verbose > 1) + { + str = format (str, " Rx fifo: %U", format_svm_fifo, + s->server_rx_fifo, 1); + str = format (str, " Tx fifo: %U", format_svm_fifo, + s->server_tx_fifo, 1); + } + vlib_cli_output (vm, "%v", str); })); /* *INDENT-ON* */ } diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index d13fd446..aaefa7eb 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -187,6 +187,9 @@ tclient_thread_fn (void *arg) clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0]; + vec_validate (session_indices, 0); + vec_reset_length (session_indices); + while (1) { /* Wait until we're told to get busy */ @@ -201,14 +204,12 @@ tclient_thread_fn (void *arg) tm->run_test = 0; rx_total = 0; - clib_warning ("Run %d iterations", tm->n_iterations); + clib_warning ("Start test..."); before = clib_time_now (&ttime); - for (i = 0; i < tm->n_iterations; i++) + do { - session_t *sp; - do { try_tx = try_rx = 0; @@ -229,24 +230,38 @@ tclient_thread_fn (void *arg) receive_test_chunk (tm, sp); try_rx = 1; } + else + { + /* Session is complete */ + vec_add1 (session_indices, sp - tm->sessions); + } })); + /* Terminate any completed sessions */ + if (PREDICT_FALSE (_vec_len(session_indices) != 0)) + { + for (i = 0; i < _vec_len (session_indices); i++) + { + sp = pool_elt_at_index (tm->sessions, session_indices[i]); + rx_total += sp->bytes_received; + dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = tm->my_client_index; + dmp->handle = sp->vpp_session_handle; + vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + pool_put (tm->sessions, sp); + } + _vec_len(session_indices) = 0; + } /* *INDENT-ON* */ 
} while (try_tx || try_rx); - - /* *INDENT-OFF* */ - pool_foreach (sp, tm->sessions, - ({ - rx_total += sp->bytes_received; - sp->bytes_received = 0; - sp->bytes_to_send = tm->bytes_to_send; - })); - /* *INDENT-ON* */ } + while (0); after = clib_time_now (&ttime); - clib_warning ("Done %d iterations, %lld bytes in %.2f secs", - tm->n_iterations, rx_total, (after - before)); + clib_warning ("Test complete %lld bytes in %.2f secs", + rx_total, (after - before)); if ((after - before) != 0.0) { clib_warning ("%.2f bytes/second full-duplex", @@ -255,28 +270,11 @@ tclient_thread_fn (void *arg) (((f64) rx_total * 8.0) / (after - before)) / 1e9); } - /* Disconnect sessions... */ - vec_reset_length (session_indices); - - /* *INDENT-OFF* */ - pool_foreach (sp, tm->sessions, - ({ - vec_add1 (session_indices, sp - tm->sessions); - })); - /* *INDENT-ON* */ - - for (i = 0; i < vec_len (session_indices); i++) - { - sp = pool_elt_at_index (tm->sessions, session_indices[i]); - dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = tm->my_client_index; - dmp->handle = sp->vpp_session_handle; - vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); - pool_put (tm->sessions, sp); - } + if (pool_elts (tm->sessions)) + clib_warning ("BUG: %d active sessions remain...", + pool_elts (tm->sessions)); } + while (0); /* NOTREACHED */ #if TCP_BUILTIN_CLIENT_PTHREAD return 0; @@ -511,15 +509,12 @@ test_tcp_clients_command_fn (vlib_main_t * vm, u64 tmp; tm->bytes_to_send = 8192; - tm->n_iterations = 1; vec_free (tm->connect_uri); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "nclients %d", &n_clients)) ; - else if (unformat (input, "iterations %d", &tm->n_iterations)) - ; else if (unformat (input, "mbytes %lld", &tmp)) tm->bytes_to_send = tmp << 20; else if (unformat (input, "gbytes %lld", &tmp)) diff --git a/src/vnet/tcp/builtin_client.h 
b/src/vnet/tcp/builtin_client.h index 756b3d18..57d112e6 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -93,9 +93,6 @@ typedef struct /* Signal variables */ volatile int run_test; - /* Number of iterations */ - int n_iterations; - /* Bytes to send */ u64 bytes_to_send; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 36d85e46..9b7b2f65 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -461,13 +461,12 @@ const char *tcp_fsm_states[] = { u8 * format_tcp_state (u8 * s, va_list * args) { - tcp_state_t *state = va_arg (*args, tcp_state_t *); + u32 state = va_arg (*args, u32); - if (*state < TCP_N_STATES) - s = format (s, "%s", tcp_fsm_states[*state]); + if (state < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state]); else - s = format (s, "UNKNOWN (%d (0x%x))", *state, *state); - + s = format (s, "UNKNOWN (%d (0x%x))", state, state); return s; } @@ -503,7 +502,51 @@ format_tcp_timers (u8 * s, va_list * args) } u8 * -format_tcp_connection (u8 * s, va_list * args) +format_tcp_congestion_status (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tcp_in_recovery (tc)) + s = format (s, "recovery"); + else if (tcp_in_fastrecovery (tc)) + s = format (s, "fastrecovery"); + else + s = format (s, "none"); + return s; +} + +u8 * +format_tcp_vars (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n", + tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, + tc->snd_una_max - tc->iss); + s = format (s, " rcv_nxt %u rcv_las %u\n", + tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs); + s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", + tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, + tc->snd_wl2 - tc->iss); + s = format (s, " flight size %u send space %u rcv_wnd available %d\n", + tcp_flight_size (tc), tcp_snd_space (tc), + tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las)); + s = format 
(s, " cong %U ", format_tcp_congestion_status, tc); + s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", + tc->cwnd, tc->ssthresh, tc->rtx_bytes, tc->bytes_acked); + s = format (s, " prev_ssthresh %u snd_congestion %u\n", tc->prev_ssthresh, + tc->snd_congestion - tc->iss); + s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, + tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); + s = format (s, "rtt_seq %u\n", tc->rtt_seq); + if (scoreboard_first_hole (&tc->sack_sb)) + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + if (vec_len (tc->snd_sacks)) + s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); + + return s; +} + +u8 * +format_tcp_connection_id (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); if (!tc) @@ -527,11 +570,18 @@ format_tcp_connection (u8 * s, va_list * args) } u8 * -format_tcp_connection_verbose (u8 * s, va_list * args) +format_tcp_connection (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, "%U %U %U", format_tcp_connection, tc, format_tcp_state, - &tc->state, format_tcp_timers, tc); + u32 verbose = va_arg (*args, u32); + + s = format (s, "%-50U", format_tcp_connection_id, tc); + if (verbose) + { + s = format (s, "%-15U", format_tcp_state, tc->state); + if (verbose > 1) + s = format (s, " %U\n%U", format_tcp_timers, tc, format_tcp_vars, tc); + } return s; } @@ -540,11 +590,12 @@ format_tcp_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); u32 thread_index = va_arg (*args, u32); + u32 verbose = va_arg (*args, u32); tcp_connection_t *tc; tc = tcp_connection_get (tci, thread_index); if (tc) - return format (s, "%U", format_tcp_connection, tc); + return format (s, "%U", format_tcp_connection, tc, verbose); else return format (s, "empty"); } @@ -554,7 +605,7 @@ format_tcp_listener_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); tcp_connection_t *tc = 
tcp_listener_get (tci); - return format (s, "%U", format_tcp_connection, tc); + return format (s, "%U", format_tcp_connection_id, tc); } u8 * @@ -562,7 +613,7 @@ format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); tcp_connection_t *tc = tcp_half_open_connection_get (tci); - return format (s, "%U", format_tcp_connection, tc); + return format (s, "%U", format_tcp_connection_id, tc); } u8 * @@ -659,12 +710,18 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) /** * Compute tx window session is allowed to fill. + * + * Takes into account available send space, snd_mss and the congestion + * state of the connection. If possible, the value returned is a multiple + * of snd_mss. + * + * @param tc tcp connection + * @return number of bytes session is allowed to write */ u32 -tcp_session_send_space (transport_connection_t * trans_conn) +tcp_snd_space (tcp_connection_t * tc) { int snd_space; - tcp_connection_t *tc = (tcp_connection_t *) trans_conn; /* If we haven't gotten dupacks or if we did and have gotten sacked bytes * then we can still send */ @@ -697,6 +754,13 @@ tcp_session_send_space (transport_connection_t * trans_conn) return 0; } +u32 +tcp_session_send_space (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return tcp_snd_space (tc); +} + u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 8d24a70b..c3ebe22b 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -387,8 +387,8 @@ void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); void tcp_connection_reset (tcp_connection_t * tc); +u8 *format_tcp_connection_id (u8 * s, va_list * args); u8 *format_tcp_connection (u8 * s, va_list * args); -u8 *format_tcp_connection_verbose (u8 * s, va_list * args); u8 *format_tcp_scoreboard (u8 * s, va_list * args); always_inline tcp_connection_t * @@ -481,6 
+481,8 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +u32 tcp_rcv_wnd_available (tcp_connection_t * tc); +u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); void tcp_retransmit_first_unacked (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 9d3f4cc3..35bc9094 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -722,9 +722,11 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) TCP_EVT_DBG (TCP_EVT_SND_WND, tc); /* Set probe timer if we just got 0 wnd */ - if (tc->snd_wnd < tc->snd_mss - && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) - tcp_persist_timer_set (tc); + if (tc->snd_wnd < tc->snd_mss) + { + if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + tcp_persist_timer_set (tc); + } else tcp_persist_timer_reset (tc); } @@ -763,6 +765,7 @@ static void tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; + u32 bytes_advanced; if (tcp_in_fastrecovery (tc)) { @@ -804,10 +807,14 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) tc->rcv_dupacks = 0; if (tcp_in_recovery (tc)) { - tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes); + bytes_advanced = tc->bytes_acked + tc->sack_sb.snd_una_adv; + tc->rtx_bytes -= clib_min (bytes_advanced, tc->rtx_bytes); tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); if (seq_geq (tc->snd_una, tc->snd_congestion)) - tcp_recovery_off (tc); + { + tc->rtx_bytes = 0; + tcp_recovery_off (tc); + } } } } @@ -1221,7 +1228,7 @@ format_tcp_rx_trace (u8 * s, va_list * args) s = format (s, "%U\n%U%U", format_tcp_header, &t->tcp_header, 128, format_white_space, indent, - format_tcp_connection_verbose, &t->tcp_connection); + format_tcp_connection, &t->tcp_connection, 1); return s; } @@ -1236,7 +1243,7 @@ format_tcp_rx_trace_short (u8 * s, va_list * args) s = format (s, "%d -> %d (%U)", clib_net_to_host_u16 
(t->tcp_header.src_port), clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state, - &t->tcp_connection.state); + t->tcp_connection.state); return s; } @@ -2165,6 +2172,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->c_rmt_port = th0->src_port; child0->c_is_ip4 = is_ip4; child0->c_thread_index = my_thread_index; + child0->state = TCP_STATE_SYN_RCVD; if (is_ip4) { @@ -2194,7 +2202,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; child0->rcv_las = child0->rcv_nxt; - child0->state = TCP_STATE_SYN_RCVD; /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} * segments are used to initialize PAWS. */ @@ -2450,7 +2457,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (error0 == TCP_ERROR_DISPATCH) clib_warning ("disp error state %U flags %U", - format_tcp_state, &state0, format_tcp_flags, + format_tcp_state, state0, format_tcp_flags, (int) flags0); } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a462d8da..49fd6bef 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -59,7 +59,7 @@ format_tcp_tx_trace (u8 * s, va_list * args) s = format (s, "%U\n%U%U", format_tcp_header, &t->tcp_header, 128, format_white_space, indent, - format_tcp_connection_verbose, &t->tcp_connection); + format_tcp_connection, &t->tcp_connection, 1); return s; } @@ -1024,6 +1024,11 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (n_bytes != 0); b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state, 0); + + /* Don't count multiple retransmits of the same segment */ + if (tc->rto_boff > 1) + goto done; + tc->rtx_bytes += n_bytes; done: @@ -1103,7 +1108,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (n_bytes == 0) { - clib_warning ("could not retransmit"); + clib_warning ("could not retransmit anything"); + 
/* Try again eventually */ + tcp_retransmit_timer_set (tc); return; } } @@ -1203,6 +1210,7 @@ tcp_timer_persist_handler (u32 index) /* Nothing to send */ if (n_bytes == 0) { + clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } -- cgit 1.2.3-korg From 93992a9048cb6e5dcd22de5091e72de778122627 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 24 May 2017 18:03:56 -0700 Subject: Implement sack based tcp loss recovery (RFC 6675) - refactor existing congestion control code (RFC 6582/5681). Handling of ack feedback now consists of: ack parsing, cc event detection, event handling, congestion control update - extend sack scoreboard to support sack based retransmissions - basic implementation of Eifel detection algorithm (RFC 3522) for detecting spurious retransmissions - actually initialize the per-thread frame freelist hash tables - increase worker stack size to 2mb - fix session queue node out-of-buffer handling - ensure that the local buffer cache vec_len matches reality - avoid 2x spurious event requeues when short of buffers - count out-of-buffer events - make the builtin server thread-safe - fix bihash template threading issue: need to paint -1 across uninitialized working_copy_length vector elements (via rebase from master) Change-Id: I646cb9f1add9a67d08f4a87badbcb117980ebfc4 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 5 +- src/vlib/node.c | 1 + src/vlib/threads.c | 2 +- src/vlib/threads.h | 2 +- src/vnet/session/node.c | 53 ++-- src/vnet/session/session.c | 11 +- src/vnet/session/session.h | 6 +- src/vnet/session/session_cli.c | 26 +- src/vnet/tcp/builtin_client.c | 40 ++- src/vnet/tcp/builtin_server.c | 20 +- src/vnet/tcp/tcp.c | 57 ++-- src/vnet/tcp/tcp.h | 112 +++++-- src/vnet/tcp/tcp_debug.h | 16 +- src/vnet/tcp/tcp_input.c | 671 +++++++++++++++++++++++++++++------------ src/vnet/tcp/tcp_newreno.c | 20 +- src/vnet/tcp/tcp_output.c | 287 ++++++++++++------ src/vnet/tcp/tcp_test.c | 53 ++-- 17 
files changed, 973 insertions(+), 409 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index f13f6fea..5c8f244a 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -540,7 +540,7 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); - if (PREDICT_FALSE (cursize == 0)) + if (PREDICT_FALSE (cursize < relative_offset)) return -2; /* nothing in the fifo */ nitems = f->nitems; @@ -548,7 +548,8 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, real_head = real_head >= nitems ? real_head - nitems : real_head; /* Number of bytes we're going to copy */ - total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + total_copy_bytes = (cursize - relative_offset < max_bytes) ? + cursize - relative_offset : max_bytes; if (PREDICT_TRUE (copy_here != 0)) { diff --git a/src/vlib/node.c b/src/vlib/node.c index bbd3a42e..eecad274 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -502,6 +502,7 @@ vlib_node_main_init (vlib_main_t * vm) vlib_node_t *n; uword ni; + nm->frame_size_hash = hash_create (0, sizeof (uword)); nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; /* Generate sibling relationships */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index b7bc9e26..0c775e2d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -670,7 +670,7 @@ start_workers (vlib_main_t * vm) /* zap the (per worker) frame freelists, etc */ nm_clone->frame_sizes = 0; - nm_clone->frame_size_hash = 0; + nm_clone->frame_size_hash = hash_create (0, sizeof (uword)); /* Packet trace buffers are guaranteed to be empty, nothing to do here */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 17d35a24..572ce77f 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -62,7 +62,7 @@ typedef struct vlib_thread_registration_ #define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */ #define 
VLIB_OFFSET_MASK (~VLIB_CPU_MASK) -#define VLIB_LOG2_THREAD_STACK_SIZE (20) +#define VLIB_LOG2_THREAD_STACK_SIZE (21) #define VLIB_THREAD_STACK_SIZE (1<session_type]; @@ -167,9 +169,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Check how much we can pull. If buffering, subtract the offset */ max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; - /* Allow enqueuing of a new event */ - svm_fifo_unset_event (s0->server_tx_fifo); - /* Nothing to read return */ if (max_dequeue0 == 0) return 0; @@ -187,8 +186,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, max_len_to_snd0 = snd_space0; } - n_bytes_per_buf = vlib_buffer_free_list_buffer_size (vm, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + n_bytes_per_buf = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) @@ -205,24 +204,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) { vec_validate (smm->tx_buffers[thread_index], - n_bufs + VLIB_FRAME_SIZE - 1); - n_bufs += vlib_buffer_alloc (vm, - &smm->tx_buffers[thread_index][n_bufs], - VLIB_FRAME_SIZE); - - /* buffer shortage - * XXX 0.9 because when debugging we might not get a full frame */ - if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) + n_bufs + 2 * VLIB_FRAME_SIZE - 1); + + buffers_allocated = 0; + do { - if (svm_fifo_set_event (s0->server_tx_fifo)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - } - return -1; + buffers_allocated_this_call = + vlib_buffer_alloc + (vm, + &smm->tx_buffers[thread_index][n_bufs + buffers_allocated], + 2 * VLIB_FRAME_SIZE - buffers_allocated); + buffers_allocated += buffers_allocated_this_call; } + while (buffers_allocated_this_call > 0 + && 
((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); + + n_bufs += buffers_allocated; _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + return -1; + } } + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) @@ -232,7 +240,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, */ /* Get free buffer */ + ASSERT (n_bufs >= 1); bi0 = smm->tx_buffers[thread_index][--n_bufs]; + ASSERT (bi0); _vec_len (smm->tx_buffers[thread_index]) = n_bufs; b0 = vlib_get_buffer (vm, bi0); @@ -545,9 +555,10 @@ skip_dequeue: my_thread_index, &n_tx_packets); /* Out of buffers */ - if (rv < 0) + if (PREDICT_FALSE (rv < 0)) { - vec_add1 (smm->pending_event_vector[my_thread_index], *e0); + vlib_node_increment_counter (vm, node->node_index, + SESSION_QUEUE_ERROR_NO_BUFFER, 1); continue; } break; diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 02b0cced..534598d6 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -551,7 +551,7 @@ u8 stream_session_no_space (transport_connection_t * tc, u32 thread_index, u16 data_len) { - stream_session_t *s = stream_session_get (tc->c_index, thread_index); + stream_session_t *s = stream_session_get (tc->s_index, thread_index); if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY)) return 1; @@ -563,6 +563,15 @@ stream_session_no_space (transport_connection_t * tc, u32 thread_index, } u32 +stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + if (s->session_state != SESSION_STATE_READY) + return 0; + return svm_fifo_max_dequeue (s->server_tx_fifo); +} + +int stream_session_peek_bytes (transport_connection_t * tc, u8 * 
buffer, u32 offset, u32 max_bytes) { diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index a8728649..d9c38bd1 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -352,16 +352,18 @@ stream_session_max_rx_enqueue (transport_connection_t * tc) } always_inline u32 -stream_session_fifo_size (transport_connection_t * tc) +stream_session_rx_fifo_size (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); return s->server_rx_fifo->nitems; } +u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc); + int stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order); -u32 +int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 509eedbb..6b8341aa 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -15,6 +15,15 @@ #include #include +u8 * +format_stream_session_fifos (u8 * s, va_list * args) +{ + stream_session_t *ss = va_arg (*args, stream_session_t *); + s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1); + s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1); + return s; +} + /** * Format stream session as per the following format * @@ -44,6 +53,8 @@ format_stream_session (u8 * s, va_list * args) ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else if (ss->session_state == SESSION_STATE_LISTENING) { @@ -57,8 +68,12 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U%v", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose, str); + s 
= format (s, "[CL] %-40U", tp_vft->format_connection, + ss->connection_index, ss->thread_index, verbose); + if (verbose == 1) + s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else { @@ -124,13 +139,6 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ({ vec_reset_length (str); str = format (str, "%U", format_stream_session, s, verbose); - if (verbose > 1) - { - str = format (str, " Rx fifo: %U", format_svm_fifo, - s->server_rx_fifo, 1); - str = format (str, " Tx fifo: %U", format_svm_fifo, - s->server_tx_fifo, 1); - } vlib_cli_output (vm, "%v", str); })); /* *INDENT-ON* */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 768f0c3c..7238cda3 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -115,8 +115,17 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) /* Allow enqueuing of new event */ // svm_fifo_unset_event (rx_fifo); - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), - tm->rx_buf); + if (test_bytes) + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), + tm->rx_buf); + } + else + { + n_read = svm_fifo_max_dequeue (rx_fifo); + svm_fifo_dequeue_drop (rx_fifo, n_read); + } + if (n_read > 0) { if (TCP_BUILTIN_CLIENT_DBG) @@ -165,6 +174,8 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, int i; int delete_session; u32 *connection_indices; + u32 tx_quota = 0; + u32 delta, prev_bytes_received_this_session; connection_indices = tm->connection_index_by_thread[my_thread_index]; @@ -177,14 +188,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, sp = pool_elt_at_index (tm->sessions, connection_indices[i]); - if (sp->bytes_to_send > 0) + if (tx_quota < 60 && sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; + tx_quota++; } if (sp->bytes_to_receive > 0) { + prev_bytes_received_this_session = sp->bytes_received; 
receive_test_chunk (tm, sp); + delta = sp->bytes_received - prev_bytes_received_this_session; + if (delta > 0) + tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) @@ -195,11 +211,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; dmp->handle = sp->vpp_session_handle; - vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; - __sync_fetch_and_add (&tm->ready_connections, -1); +// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, + 1)) + { + vec_delete (connection_indices, 1, i); + tm->connection_index_by_thread[my_thread_index] = + connection_indices; + __sync_fetch_and_add (&tm->ready_connections, -1); + } + else + { + vl_msg_api_free (dmp); + } /* Kick the debug CLI process */ if (tm->ready_connections == 0) diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 4f0e211c..8bd2f360 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -39,7 +39,8 @@ typedef struct { - u8 *rx_buf; + /* Per-thread RX buffer */ + u8 **rx_buf; unix_shared_memory_queue_t **vpp_queue; u64 byte_index; @@ -117,13 +118,15 @@ void test_bytes (builtin_server_main_t * bsm, int actual_transfer) { int i; + u32 my_thread_id = vlib_get_thread_index (); for (i = 0; i < actual_transfer; i++) { - if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) + if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff)) { clib_warning ("at %lld expected %d got %d", bsm->byte_index + i, - (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); + (bsm->byte_index + i) & 0xff, + bsm->rx_buf[my_thread_id][i]); } } bsm->byte_index += actual_transfer; @@ -138,6 +141,7 @@ builtin_server_rx_callback (stream_session_t * s) 
builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; + u32 my_thread_id = vlib_get_thread_index (); tx_fifo = s->server_tx_fifo; rx_fifo = s->server_rx_fifo; @@ -171,11 +175,12 @@ builtin_server_rx_callback (stream_session_t * s) return 0; } - vec_validate (bsm->rx_buf, max_transfer - 1); - _vec_len (bsm->rx_buf) = max_transfer; + vec_validate (bsm->rx_buf, my_thread_id); + vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1); + _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf); + bsm->rx_buf[my_thread_id]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -184,7 +189,8 @@ builtin_server_rx_callback (stream_session_t * s) * Echo back */ - n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, bsm->rx_buf); + n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, + bsm->rx_buf[my_thread_id]); if (n_written != max_transfer) clib_warning ("short trout!"); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 9b7b2f65..e0b67a8e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -195,8 +195,8 @@ tcp_connection_close (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_CLOSE, tc); /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_CLOSE_WAIT) + if (tc->state == TCP_STATE_ESTABLISHED + || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) tcp_send_fin (tc); /* Switch state */ @@ -480,7 +480,7 @@ u8 * format_tcp_timers (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = 0; + int i, last = -1; for (i = 0; i < TCP_N_TIMERS; i++) if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) @@ -493,7 +493,7 @@ format_tcp_timers (u8 * s, va_list * args) s = format (s, "%s,", tcp_conn_timers[i]); } - if (last > 0) + if (last >= 0) s = format (s, "%s]", 
tcp_conn_timers[i]); else s = format (s, "]"); @@ -526,19 +526,19 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u send space %u rcv_wnd available %d\n", - tcp_flight_size (tc), tcp_snd_space (tc), - tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las)); + s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", + tcp_flight_size (tc), tcp_available_snd_space (tc), + tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", - tc->cwnd, tc->ssthresh, tc->rtx_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u\n", tc->prev_ssthresh, - tc->snd_congestion - tc->iss); + tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + tc->prev_ssthresh, tc->snd_congestion - tc->iss, + tc->rcv_dupacks); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); - if (scoreboard_first_hole (&tc->sack_sb)) - s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -595,9 +595,10 @@ format_tcp_session (u8 * s, va_list * args) tc = tcp_connection_get (tci, thread_index); if (tc) - return format (s, "%U", format_tcp_connection, tc, verbose); + s = format (s, "%U", format_tcp_connection, tc, verbose); else - return format (s, "empty"); + s = format (s, "empty"); + return s; } u8 * @@ -643,13 +644,17 @@ format_tcp_scoreboard (u8 * s, va_list * args) { sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); sack_scoreboard_hole_t *hole; 
- s = format (s, "head %u tail %u snd_una_adv %u\n", sb->head, sb->tail, - sb->snd_una_adv); - s = format (s, "sacked_bytes %u last_sacked_bytes %u", sb->sacked_bytes, - sb->last_sacked_bytes); - s = format (s, " max_byte_sacked %u\n", sb->max_byte_sacked); - s = format (s, "holes:\n"); + s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", + sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); + s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", + sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", + sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + hole = scoreboard_first_hole (sb); + if (hole) + s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail); + while (hole) { s = format (s, "%U", format_tcp_sack_hole, hole); @@ -736,7 +741,7 @@ tcp_snd_space (tcp_connection_t * tc) if (tcp_in_recovery (tc)) { tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes - (tc->snd_una_max - tc->snd_congestion); if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; @@ -744,8 +749,8 @@ tcp_snd_space (tcp_connection_t * tc) } /* If in fast recovery, send 1 SMSS if wnd allows */ - if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) - && tcp_fastrecovery_sent_1_smss (tc)) + if (tcp_in_fastrecovery (tc) + && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; @@ -761,6 +766,12 @@ tcp_session_send_space (transport_connection_t * trans_conn) return tcp_snd_space (tc); } +i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index c3ebe22b..071f1ab1 100644 --- a/src/vnet/tcp/tcp.h +++ 
b/src/vnet/tcp/tcp.h @@ -34,6 +34,7 @@ #define TCP_MAX_RX_FIFO_SIZE 2 << 20 #define TCP_IW_N_SEGMENTS 10 #define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ +#define TCP_USE_SACKS 1 /**< Disable only for testing */ /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -94,7 +95,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_2MSL_TIME 300 /* 30s */ -#define TCP_CLOSEWAIT_TIME 1 /* 0.1s */ +#define TCP_CLOSEWAIT_TIME 20 /* 2s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ @@ -157,6 +158,7 @@ typedef struct _sack_scoreboard_hole u32 prev; /**< Index for previous entry in linked list */ u32 start; /**< Start sequence number */ u32 end; /**< End sequence number */ + u8 is_lost; /**< Mark hole as lost */ } sack_scoreboard_hole_t; typedef struct _sack_scoreboard @@ -166,8 +168,13 @@ typedef struct _sack_scoreboard u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 last_bytes_delivered; /**< Number of sack bytes delivered */ u32 snd_una_adv; /**< Bytes to add to snd_una */ - u32 max_byte_sacked; /**< Highest byte acked */ + u32 high_sacked; /**< Highest byte sacked (fack) */ + u32 high_rxt; /**< Highest retransmitted sequence */ + u32 rescue_rxt; /**< Rescue sequence number */ + u32 lost_bytes; /**< Bytes lost as per RFC6675 */ + u32 cur_rxt_hole; /**< Retransmitting from this hole */ } sack_scoreboard_t; typedef enum _tcp_cc_algorithm_type @@ -211,7 +218,7 @@ typedef struct _tcp_connection u32 irs; /**< initial remote sequence */ /* Options */ - tcp_options_t opt; /**< TCP connection options parsed */ + tcp_options_t rcv_opts; /**< Rx options for connection */ tcp_options_t snd_opts; /**< Tx options for connection */ u8 snd_opts_len; /**< Tx options len */ u8
rcv_wscale; /**< Window scale to advertise to peer */ @@ -229,8 +236,10 @@ typedef struct _tcp_connection u32 cwnd; /**< Congestion window */ u32 ssthresh; /**< Slow-start threshold */ u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 prev_cwnd; /**< cwnd before congestion */ u32 bytes_acked; /**< Bytes acknowledged by current segment */ - u32 rtx_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ u32 snd_congestion; /**< snd_una_max when congestion is detected */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ @@ -411,6 +420,7 @@ void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); +void tcp_update_rto (tcp_connection_t * tc); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -428,17 +438,39 @@ tcp_end_seq (tcp_header_t * th, u32 len) #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) +/** + * Our estimate of the number of bytes that have left the network + */ +always_inline u32 +tcp_bytes_out (const tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; + else + return tc->rcv_dupacks * tc->snd_mss; +} + +/** + * Our estimate of the number of bytes in flight (pipe size) + */ always_inline u32 tcp_flight_size (const tcp_connection_t * tc) { int flight_size; - flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes) - - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ; + flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc) + + tc->snd_rxt_bytes; - /* Happens if we don't clear sacked bytes */ if (flight_size < 0) - return 0; + { + if (0) +
clib_warning + ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d", + tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc), + tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes, + tc->rcv_opts.flags); + return 0; + } return flight_size; } @@ -481,14 +513,17 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } -u32 tcp_rcv_wnd_available (tcp_connection_t * tc); +i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); void tcp_retransmit_first_unacked (tcp_connection_t * tc); +void tcp_fast_retransmit_no_sack (tcp_connection_t * tc); +void tcp_fast_retransmit_sack (tcp_connection_t * tc); void tcp_fast_retransmit (tcp_connection_t * tc); -void tcp_cc_congestion (tcp_connection_t * tc); -void tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_init_congestion (tcp_connection_t * tc); +int tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_fastrecovery_exit (tcp_connection_t * tc); /* Made public for unit testing only */ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); @@ -563,16 +598,16 @@ tcp_retransmit_timer_set (tcp_connection_t * tc) } always_inline void -tcp_retransmit_timer_update (tcp_connection_t * tc) +tcp_retransmit_timer_reset (tcp_connection_t * tc) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, - clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); } always_inline void -tcp_retransmit_timer_reset (tcp_connection_t * tc) +tcp_retransmit_timer_force_update (tcp_connection_t * tc) { - tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); } always_inline void @@ -598,15 +633,43 @@ tcp_persist_timer_reset (tcp_connection_t * tc) tcp_timer_reset (tc, TCP_TIMER_PERSIST); } +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + if (tc->snd_una == 
tc->snd_una_max) + { + tcp_retransmit_timer_reset (tc); + if (tc->snd_wnd < tc->snd_mss) + tcp_persist_timer_set (tc); + } + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; } +#define tcp_validate_txf_size(_tc, _a) \ + ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ + || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a) + void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); +void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb); +sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb, + u32 prev_index, u32 start, + u32 end); +sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * + start, u8 have_sent_1_smss, + u8 * can_rescue, + u8 * snd_limited); +void scoreboard_init_high_rxt (sack_scoreboard_t * sb); always_inline sack_scoreboard_hole_t * scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) @@ -624,6 +687,14 @@ scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) return 0; } +always_inline sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + always_inline sack_scoreboard_hole_t * scoreboard_first_hole (sack_scoreboard_t * sb) { @@ -643,15 +714,19 @@ scoreboard_last_hole (sack_scoreboard_t * sb) always_inline void scoreboard_clear (sack_scoreboard_t * sb) { - sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb); + sack_scoreboard_hole_t *hole; while ((hole = scoreboard_first_hole (sb))) { scoreboard_remove_hole (sb, hole); } sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; sb->snd_una_adv = 0; - sb->max_byte_sacked = 0; + 
sb->high_sacked = 0; + sb->high_rxt = 0; + sb->lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } always_inline u32 @@ -671,6 +746,7 @@ scoreboard_init (sack_scoreboard_t * sb) { sb->head = TCP_INVALID_SACK_HOLE_INDEX; sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index b4497a3b..3a16cf63 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -393,7 +393,7 @@ typedef enum _tcp_dbg_evt DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _seq - _tc->irs; \ ed->data[1] = _end - _tc->irs; \ - ed->data[2] = _tc->opt.tsval; \ + ed->data[2] = _tc->rcv_opts.tsval; \ ed->data[3] = _tc->tsval_recent; \ } @@ -427,27 +427,27 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "rtx: snd_nxt %u offset %u snd %u rtx %u", \ + .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ .format_args = "i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->snd_nxt - _tc->iss; \ ed->data[1] = offset; \ ed->data[2] = n_bytes; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u", \ + .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \ .format_args = "t4i4i4i4", \ .n_enum_strings = 5, \ .enum_strings = { \ - "fast-rtx", \ - "rtx-timeout", \ - "first-rtx", \ + "fast-rxt", \ + "rxt-timeout", \ + "first-rxt", \ "recovered", \ "congestion", \ }, \ @@ -456,7 +456,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _sub_evt; \ ed->data[1] = tcp_available_snd_space (_tc); \ ed->data[2] = _tc->snd_congestion - _tc->iss; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) 
\ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 35bc9094..ff2229b3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -231,8 +231,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent - && timestamp_lt (tc->opt.tsval, tc->tsval_recent); + return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent + && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent); } /** @@ -248,10 +248,10 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) * then the TSval from the segment is copied to TS.Recent; * otherwise, the TSval is ignored. */ - if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { - tc->tsval_recent = tc->opt.tsval; + tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } } @@ -272,14 +272,21 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) return -1; - if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt))) + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) { return -1; } if (tcp_segment_check_paws (tc0)) { - clib_warning ("paws failed"); + if (CLIB_DEBUG > 2) + { + clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); + clib_warning ("seq %u seq_end %u ack %u", + vnet_buffer (b0)->tcp.seq_number - tc0->irs, + vnet_buffer (b0)->tcp.seq_end - tc0->irs, + vnet_buffer (b0)->tcp.ack_number - tc0->iss); + } TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); @@ -348,7 +355,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* If segment in window, save timestamp */ tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); 
- return 0; } @@ -391,6 +397,12 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) } } +void +tcp_update_rto (tcp_connection_t * tc) +{ + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); +} + /** Update RTT estimate and RTO timer * * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK @@ -405,7 +417,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) u32 mrtt = 0; u8 rtx_acked; - /* Determine if only rtx bytes are acked. TODO fast retransmit */ + /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */ rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); /* Karn's rule, part 1. Don't use retransmitted segments to estimate @@ -418,9 +430,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't * try to update rtt for dupacks */ - else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr + && tc->bytes_acked) { - mrtt = tcp_time_now () - tc->opt.tsecr; + mrtt = tcp_time_now () - tc->rcv_opts.tsecr; } /* Allow measuring of a new RTT */ @@ -436,7 +449,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tcp_update_rto (tc); return 0; } @@ -447,25 +460,46 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) static void tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) { - /* Dequeue the newly ACKed bytes */ - stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + /* Dequeue the newly ACKed and SACKed bytes */ + stream_session_dequeue_drop (&tc->connection, + tc->bytes_acked + tc->sack_sb.snd_una_adv); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); /* Update rtt and rto */ tcp_update_rtt (tc, ack); + + /* If everything has been acked, stop retransmit timer + * otherwise update.
*/ + tcp_retransmit_timer_update (tc); } /** - * Check if dupack as per RFC5681 Sec. 2 - * - * This works only if called before updating snd_wnd. - * */ -always_inline u8 -tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) + * Check if duplicate ack as per RFC5681 Sec. 2 + */ +static u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, + u32 prev_snd_una) { - return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una) && seq_gt (tc->snd_una_max, tc->snd_una) && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) - && (new_snd_wnd == tc->snd_wnd)); + && (prev_snd_wnd == tc->snd_wnd)); +} + +/** + * Checks if ack is a congestion control event. + */ +static u8 +tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, + u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack) +{ + /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are + * defined to be 'duplicate' */ + *is_dack = tc->sack_sb.last_sacked_bytes + || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); + + return (*is_dack || tcp_in_cong_recovery (tc)); } void @@ -478,6 +512,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) next = pool_elt_at_index (sb->holes, hole->next); next->prev = hole->prev; } + else + { + sb->tail = hole->prev; + } if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) { @@ -489,6 +527,9 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) sb->head = hole->next; } + if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + pool_put (sb->holes, hole); } @@ -527,26 +568,131 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, return hole; } +void +scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole, *prev; + u32 bytes = 0, blks = 0; + + sb->lost_bytes = 0; + hole = 
scoreboard_last_hole (sb); + if (!hole) + return; + + if (seq_gt (sb->high_sacked, hole->end)) + { + bytes = sb->high_sacked - hole->end; + blks = 1; + } + + while ((prev = scoreboard_prev_hole (sb, hole)) + && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD)) + { + bytes += hole->start - prev->end; + blks++; + hole = prev; + } + + hole = prev; + while (hole) + { + sb->lost_bytes += scoreboard_hole_bytes (hole); + hole->is_lost = 1; + hole = scoreboard_prev_hole (sb, hole); + } +} + +/** + * Figure out the next hole to retransmit + * + * Follows logic proposed in RFC6675 Sec. 4, NextSeg() + */ +sack_scoreboard_hole_t * +scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * start, + u8 have_sent_1_smss, + u8 * can_rescue, u8 * snd_limited) +{ + sack_scoreboard_hole_t *hole = 0; + + hole = start ? start : scoreboard_first_hole (sb); + while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) + hole = scoreboard_next_hole (sb, hole); + + /* Nothing, return */ + if (!hole) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + + /* Rule (1): if higher than rxt, less than high_sacked and lost */ + if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) + { + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + else + { + /* Rule (2): output takes care of transmitting new data */ + if (!have_sent_1_smss) + { + hole = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + } + /* Rule (3): if hole not lost */ + else if (seq_lt (hole->start, sb->high_sacked)) + { + *snd_limited = 1; + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + /* Rule (4): if hole beyond high_sacked */ + else + { + ASSERT (seq_geq (hole->start, sb->high_sacked)); + *snd_limited = 1; + *can_rescue = 1; + /* HighRxt MUST NOT be updated */ + return 0; + } + } + + if (hole && seq_lt (sb->high_rxt, hole->start)) + sb->high_rxt = hole->start; + + return hole; +} + +void +scoreboard_init_high_rxt 
(sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (sb); + sb->high_rxt = hole->start; + sb->cur_rxt_hole = sb->head; +} + void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; - u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index; + u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; sb->last_sacked_bytes = 0; sb->snd_una_adv = 0; old_sacked_bytes = sb->sacked_bytes; - delivered_bytes = 0; + sb->last_bytes_delivered = 0; - if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + if (!tcp_opts_sack (&tc->rcv_opts) + && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; /* Remove invalid blocks */ - blk = tc->opt.sacks; - while (blk < vec_end (tc->opt.sacks)) + blk = tc->rcv_opts.sacks; + while (blk < vec_end (tc->rcv_opts.sacks)) { if (seq_lt (blk->start, blk->end) && seq_gt (blk->start, tc->snd_una) @@ -555,7 +701,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) blk++; continue; } - vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); } /* Add block for cumulative ack */ @@ -563,20 +709,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { tmp.start = tc->snd_una; tmp.end = ack; - vec_add1 (tc->opt.sacks, tmp); + vec_add1 (tc->rcv_opts.sacks, tmp); } - if (vec_len (tc->opt.sacks) == 0) + if (vec_len (tc->rcv_opts.sacks) == 0) return; /* Make sure blocks are ordered */ - for (i = 0; i < vec_len (tc->opt.sacks); i++) - for (j = i + 1; j < vec_len (tc->opt.sacks); j++) - if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++) + for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++) + if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start)) { - tmp = tc->opt.sacks[i]; - tc->opt.sacks[i] = tc->opt.sacks[j]; - tc->opt.sacks[j] = tmp; + tmp = 
tc->rcv_opts.sacks[i]; + tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j]; + tc->rcv_opts.sacks[j] = tmp; } if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) @@ -585,25 +731,25 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, tc->snd_una, tc->snd_una_max); sb->tail = scoreboard_hole_index (sb, last_hole); - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; - sb->max_byte_sacked = tmp.end; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; + sb->high_sacked = tmp.end; } else { /* If we have holes but snd_una_max is beyond the last hole, update * last hole end */ - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->max_byte_sacked) + if (seq_gt (tc->snd_una_max, sb->high_sacked) && seq_gt (tc->snd_una_max, last_hole->end)) last_hole->end = tc->snd_una_max; } /* Walk the holes with the SACK blocks */ hole = pool_elt_at_index (sb->holes, sb->head); - while (hole && blk_index < vec_len (tc->opt.sacks)) + while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { - blk = &tc->opt.sacks[blk_index]; + blk = &tc->rcv_opts.sacks[blk_index]; if (seq_leq (blk->start, hole->start)) { @@ -617,9 +763,9 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { /* Bytes lost because snd_wnd left edge advances */ if (next_hole && seq_leq (next_hole->start, ack)) - delivered_bytes += next_hole->start - hole->end; + sb->last_bytes_delivered += next_hole->start - hole->end; else - delivered_bytes += ack - hole->end; + sb->last_bytes_delivered += ack - hole->end; } else { @@ -633,8 +779,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_last_hole (sb); /* keep track of max byte sacked for when the last hole * is acked */ - if (seq_gt (hole->end, sb->max_byte_sacked)) - sb->max_byte_sacked = hole->end; + if (seq_gt (hole->end, sb->high_sacked)) + sb->high_sacked = 
hole->end; } /* snd_una needs to be advanced */ @@ -645,12 +791,12 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->snd_una_adv = next_hole->start - ack; /* all these can be delivered */ - delivered_bytes += sb->snd_una_adv; + sb->last_bytes_delivered += sb->snd_una_adv; } else if (!next_hole) { - sb->snd_una_adv = sb->max_byte_sacked - ack; - delivered_bytes += sb->snd_una_adv; + sb->snd_una_adv = sb->high_sacked - ack; + sb->last_bytes_delivered += sb->snd_una_adv; } } @@ -691,28 +837,33 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } blk_index++; - hole = scoreboard_next_hole (sb, hole); } - else + else if (seq_leq (blk->start, hole->end)) { sb->sacked_bytes += hole->end - blk->start; hole->end = blk->start; - hole = scoreboard_next_hole (sb, hole); } + + hole = scoreboard_next_hole (sb, hole); } } sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; - sb->sacked_bytes -= delivered_bytes; + sb->sacked_bytes -= sb->last_bytes_delivered; + scoreboard_update_lost (tc, sb); } -/** Update snd_wnd +/** + * Try to update snd_wnd based on feedback received from peer. * - * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set - * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ + * If successful, and new window is 'effectively' 0, activate persist + * timer. 
+ */ static void tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) { + /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ if (seq_lt (tc->snd_wl1, seq) || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) { @@ -721,138 +872,269 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); - /* Set probe timer if we just got 0 wnd */ if (tc->snd_wnd < tc->snd_mss) { - if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + /* Set persist timer if not set and we just got 0 wnd */ + if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST) + && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)) tcp_persist_timer_set (tc); } else - tcp_persist_timer_reset (tc); + { + tcp_persist_timer_reset (tc); + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + } } } void -tcp_cc_congestion (tcp_connection_t * tc) +tcp_cc_init_congestion (tcp_connection_t * tc) { - tc->snd_congestion = tc->snd_nxt; + tcp_fastrecovery_on (tc); + tc->snd_congestion = tc->snd_una_max; tc->cc_algo->congestion (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } -void -tcp_cc_recover (tcp_connection_t * tc) +static void +tcp_cc_recovery_exit (tcp_connection_t * tc) { - /* TODO: check if time to recover was small. It might be that RTO popped - * too soon. 
- */ + /* Deflate rto */ + tcp_update_rto (tc); + tc->rto_boff = 0; + tc->snd_rxt_ts = 0; + tcp_recovery_off (tc); +} +void +tcp_cc_fastrecovery_exit (tcp_connection_t * tc) +{ tc->cc_algo->recovered (tc); + tc->snd_rxt_bytes = 0; + tc->rcv_dupacks = 0; + tcp_fastrecovery_off (tc); + tcp_fastrecovery_1_smss_off (tc); +} - tc->rtx_bytes = 0; +static void +tcp_cc_congestion_undo (tcp_connection_t * tc) +{ + tc->cwnd = tc->prev_cwnd; + tc->ssthresh = tc->prev_ssthresh; + tc->snd_nxt = tc->snd_una_max; tc->rcv_dupacks = 0; - tc->snd_nxt = tc->snd_una; + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + ASSERT (tc->rto_boff == 0); + /* TODO extend for fastrecovery */ +} - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tc->snd_rxt_ts + && tcp_opts_tstamp (&tc->rcv_opts) + && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); +} - tcp_cong_recovery_off (tc); +int +tcp_cc_recover (tcp_connection_t * tc) +{ + ASSERT (tcp_in_cong_recovery (tc)); + if (tcp_cc_is_spurious_retransmit (tc)) + { + tcp_cc_congestion_undo (tc); + return 1; + } + + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + ASSERT (tc->rto_boff == 0); + ASSERT (!tcp_in_cong_recovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); + return 0; } static void -tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) +tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) +{ + ASSERT (!tcp_in_cong_recovery (tc)); + + /* Congestion avoidance */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + + /* If a cumulative ack, make sure dupacks is 0 */ + tc->rcv_dupacks = 0; + + /* When dupacks hits the threshold we only enter fast retransmit if + * cumulative ack covers more than snd_congestion. Should snd_una + * wrap this test may fail under otherwise valid circumstances. 
+ * Therefore, proactively update snd_congestion when wrap detected. */ + if (PREDICT_FALSE + (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked) + && seq_gt (tc->snd_congestion, tc->snd_una))) + tc->snd_congestion = tc->snd_una - 1; +} + +static u8 +tcp_should_fastrecover_sack (tcp_connection_t * tc) { - u8 partial_ack; - u32 bytes_advanced; + return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes; +} - if (tcp_in_fastrecovery (tc)) +static u8 +tcp_should_fastrecover (tcp_connection_t * tc) +{ + return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD + || tcp_should_fastrecover_sack (tc)); +} + +static void +tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) +{ + /* + * Duplicate ACK. Check if we should enter fast recovery, or if already in + * it account for the bytes that left the network. + */ + if (is_dack) { - partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); - if (!partial_ack) + ASSERT (tc->snd_una != tc->snd_una_max + || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; + + if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) { - /* Clear retransmitted bytes. */ - tcp_cc_recover (tc); + ASSERT (tcp_in_fastrecovery (tc)); + /* Pure duplicate ack. If some data got acked, it's handled lower */ + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } - else + else if (tcp_should_fastrecover (tc)) { - TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + /* Things are already bad */ + if (tcp_in_cong_recovery (tc)) + { + tc->rcv_dupacks = 0; + goto partial_ack_test; + } - /* Clear retransmitted bytes. XXX should we clear all? 
*/ - tc->rtx_bytes = 0; + /* If one of the two conditions below holds, reset dupacks + * 1) Cumulative ack does not cover more than congestion threshold + * 2) RFC6582 heuristic to avoid multiple fast retransmits + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_cc_init_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + /* The first segment MUST be retransmitted */ + tcp_retransmit_first_unacked (tc); - /* In case snd_nxt is still in the past and output tries to - * shove some new bytes */ - tc->snd_nxt = tc->snd_una_max; + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; - /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) + /* If cwnd allows, send more data */ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) { - tcp_fast_retransmit (tc); + scoreboard_init_high_rxt (&tc->sack_sb); + tcp_fast_retransmit_sack (tc); } else { - /* Retransmit first unacked segment */ - tcp_retransmit_first_unacked (tc); + tcp_fast_retransmit_no_sack (tc); } + + return; } - } - else - { - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; - tc->rcv_dupacks = 0; - if (tcp_in_recovery (tc)) + else if (!tc->bytes_acked + || (tc->bytes_acked && !tcp_in_cong_recovery (tc))) { - bytes_advanced = tc->bytes_acked + tc->sack_sb.snd_una_adv; - tc->rtx_bytes -= clib_min (bytes_advanced, tc->rtx_bytes); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - if (seq_geq (tc->snd_una, tc->snd_congestion)) - { - tc->rtx_bytes = 0; - tcp_recovery_off (tc); - } + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } + else + goto partial_ack; } -}
-static void -tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) -{ -// ASSERT (seq_geq(tc->snd_una, ack)); +partial_ack_test: + + if (!tc->bytes_acked) + return; + +partial_ack: + /* + * Legitimate ACK. 1) See if we can exit recovery + */ + /* XXX limit this only to first partial ack? */ + tcp_retransmit_timer_update (tc); - tc->rcv_dupacks++; - if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + if (seq_geq (tc->snd_una, tc->snd_congestion)) { - /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ - if (tc->opt.tsecr != tc->tsecr_last_ack) - { - tc->rcv_dupacks = 0; - return; - } + /* If spurious return, we've already updated everything */ + if (tcp_cc_recover (tc)) + return; + + tc->snd_nxt = tc->snd_una_max; - tcp_fastrecovery_on (tc); + /* Treat as congestion avoidance ack */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + return; + } + + /* + * Legitimate ACK. 2) If PARTIAL ACK try to retransmit + */ + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0 */ + tc->rcv_dupacks = 0; - /* Handle congestion and dupack */ - tcp_cc_congestion (tc); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_retransmit_first_unacked (tc); - tcp_fast_retransmit (tc); + /* Post RTO timeout don't try anything fancy */ + if (tcp_in_recovery (tc)) + return; - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver */ - tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + /* Remove retransmitted bytes that have been delivered */ + if (tc->sack_sb.last_bytes_delivered + && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + { + /* If we have sacks and we haven't gotten an ack beyond high_rxt, + * remove sacked bytes delivered */ + tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered; } - else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + else { - ASSERT 
(tcp_in_fastrecovery (tc)); - - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + /* Either all retransmitted holes have been acked, or we're + * "in the blind" and retransmitting segment by segment */ + tc->snd_rxt_bytes = 0; } + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* + * Since this was a partial ack, try to retransmit some more data + */ + tcp_fast_retransmit (tc); } void @@ -862,14 +1144,18 @@ tcp_cc_init (tcp_connection_t * tc) tc->cc_algo->init (tc); } +/** + * Process incoming ACK + */ static int tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t * th, u32 * next, u32 * error) { - u32 new_snd_wnd; + u32 prev_snd_wnd, prev_snd_una; + u8 is_dack; /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ - if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { /* If we have outstanding data and this is within the window, accept it, * probably retransmit has timed out. 
Otherwise ACK segment and then @@ -892,7 +1178,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, } /* If old ACK, probably it's an old dupack */ - if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))) { *error = TCP_ERROR_ACK_OLD; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, @@ -900,54 +1186,50 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) { TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + tcp_cc_handle_event (tc, 1); } /* Don't drop yet */ return 0; } - if (tcp_opts_sack_permitted (&tc->opt)) - tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - - new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; - - if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) - { - TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); - *error = TCP_ERROR_ACK_DUP; - return -1; - } - /* - * Valid ACK + * Looks okay, process feedback */ - tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; - tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - /* Dequeue ACKed data and update RTT */ - tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + prev_snd_wnd = tc->snd_wnd; + prev_snd_una = tc->snd_una; tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, - vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + vnet_buffer (b)->tcp.ack_number, + clib_net_to_host_u16 (th->window) << tc->snd_wscale); + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + tcp_validate_txf_size (tc, tc->bytes_acked); - /* If some of our sent bytes have been acked, update cc and 
retransmit - * timer. */ if (tc->bytes_acked) - { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); - /* Updates congestion control (slow start/congestion avoidance) */ - tcp_cc_rcv_ack (tc, b); + /* + * Check if we have congestion event + */ - /* If everything has been acked, stop retransmit timer - * otherwise update. */ - if (tc->snd_una == tc->snd_una_max) - tcp_retransmit_timer_reset (tc); - else - tcp_retransmit_timer_update (tc); + if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) + { + tcp_cc_handle_event (tc, is_dack); + *error = TCP_ERROR_ACK_DUP; + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); + return vnet_buffer (b)->tcp.data_len ? 0 : -1; } + /* + * Update congestion control (slow start/congestion avoidance) + */ + tcp_cc_update (tc, b); + return 0; } @@ -1059,7 +1341,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, } /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { /* Remove SACK blocks that have been delivered */ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); @@ -1097,7 +1379,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); /* Update SACK list if in use */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { ooo_segment_t *newest; u32 start, end; @@ -1294,7 +1576,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_to_next; vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (n_left_from > 0 && n_left_to_next > 0) { u32 bi0; @@ -1321,7 +1602,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } th0 = tcp_buffer_hdr (b0); - is_fin = (th0->flags & TCP_FLAG_FIN) != 0; /* SYNs, FINs and data consume sequence numbers */ @@ -1387,7 +1667,6 @@ tcp46_established_inline (vlib_main_t * vm, 
vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); - return from_frame->n_vectors; } @@ -1582,17 +1861,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->irs = seq0; /* Parse options */ - if (tcp_options_parse (tcp0, &new_tc0->opt)) + if (tcp_options_parse (tcp0, &new_tc0->rcv_opts)) goto drop; - if (tcp_opts_tstamp (&new_tc0->opt)) + if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { - new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; new_tc0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&new_tc0->opt)) - new_tc0->snd_wscale = new_tc0->opt.wscale; + if (tcp_opts_wscale (&new_tc0->rcv_opts)) + new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; /* No scaling */ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); @@ -1845,7 +2124,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Initialize session variables */ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) - << tc0->opt.wscale; + << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -1903,13 +2182,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, break; case TCP_STATE_LAST_ACK: - /* The only thing that can arrive in this state is an + /* The only thing that [should] arrive in this state is an * acknowledgment of our FIN. If our FIN is now acknowledged, * delete the TCB, enter the CLOSED state, and return. */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; + /* Apparently our FIN was lost */ + if (tcp_fin (tcp0)) + { + /* Don't "make" fin since that increments snd_nxt */ + tcp_send_fin (tc0); + goto drop; + } + tc0->state = TCP_STATE_CLOSED; /* Don't delete the connection/session yet. 
Instead, wait a @@ -1929,8 +2216,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * retransmission of the remote FIN. Acknowledge it, and restart * the 2 MSL timeout. */ - /* TODO */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + tcp_make_ack (tc0, b0); + tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + goto drop; + break; default: ASSERT (0); @@ -2194,7 +2488,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - if (tcp_options_parse (th0, &child0->opt)) + if (tcp_options_parse (th0, &child0->rcv_opts)) { goto drop; } @@ -2205,14 +2499,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} * segments are used to initialize PAWS. */ - if (tcp_opts_tstamp (&child0->opt)) + if (tcp_opts_tstamp (&child0->rcv_opts)) { - child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent = child0->rcv_opts.tsval; child0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&child0->opt)) - child0->snd_wscale = child0->opt.wscale; + if (tcp_opts_wscale (&child0->rcv_opts)) + child0->snd_wscale = child0->rcv_opts.wscale; /* No scaling */ child0->snd_wnd = clib_net_to_host_u16 (th0->window); @@ -2477,7 +2771,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_add_trace (vm, node, b0, sizeof (*t0)); tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); } @@ -2600,7 +2893,13 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_RST, 
TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 3525f4e5..c66250e4 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -51,9 +51,23 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) } else if (ack_type == TCP_CC_PARTIALACK) { - tc->cwnd -= tc->bytes_acked; - if (tc->bytes_acked > tc->snd_mss) - tc->bytes_acked += tc->snd_mss; + /* RFC 6582 Sec. 3.2 */ + if (!tcp_opts_sack_permitted (&tc->rcv_opts)) + { + /* Deflate the congestion window by the amount of new data + * acknowledged by the Cumulative Acknowledgment field. + * If the partial ACK acknowledges at least one SMSS of new data, + * then add back SMSS bytes to the congestion window. This + * artificially inflates the congestion window in order to reflect + * the additional segment that has left the network. This "partial + * window deflation" attempts to ensure that, when fast recovery + * eventually ends, approximately ssthresh amount of data will be + * outstanding in the network.*/ + tc->cwnd = (tc->cwnd > tc->bytes_acked) ? 
+ tc->cwnd - tc->bytes_acked : 0; + if (tc->bytes_acked > tc->snd_mss) + tc->cwnd += tc->snd_mss; + } } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 49fd6bef..47c94e6d 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -136,10 +136,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = stream_session_max_rx_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - ASSERT (tc->opt.mss < max_fifo); - if (available_space < tc->opt.mss && available_space < max_fifo >> 3) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -276,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -293,14 +296,14 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -308,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += 
TCP_OPTION_LEN_SACK_PERMITTED; @@ -326,14 +329,14 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { @@ -395,7 +398,7 @@ tcp_update_snd_mss (tcp_connection_t * tc) tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); /* XXX check if MTU has been updated */ - tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; ASSERT (tc->snd_mss > 0); } @@ -406,21 +409,21 @@ tcp_init_mss (tcp_connection_t * tc) tcp_update_rcv_mss (tc); /* TODO cache mss and consider PMTU discovery */ - tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); if (tc->snd_mss < 45) { clib_warning ("snd mss is 0"); /* Assume that at least the min default mss works */ tc->snd_mss = default_min_mss; - tc->opt.mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; } /* We should have enough space for 40 bytes of options */ ASSERT (tc->snd_mss > 45); /* If we use timestamp option, account for it */ - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } @@ -879,6 +882,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -919,10 +923,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, if (compute_opts) tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Write pre-computed options */ tcp_hdr_opts_len = tc->snd_opts_len + sizeof 
(tcp_header_t); - - /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -930,26 +931,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); ASSERT (opts_write_len == tc->snd_opts_len); - - /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; tc->rcv_las = tc->rcv_nxt; /* TODO this is updated in output as well ... */ - if (tc->snd_nxt > tc->snd_una_max) - tc->snd_una_max = tc->snd_nxt; - - if (tc->rtt_ts == 0) + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) { - tc->rtt_ts = tcp_time_now (); - tc->rtt_seq = tc->snd_nxt; + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); } + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -987,13 +987,14 @@ tcp_timer_delack_handler (u32 index) * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, u32 offset, u32 max_bytes) { vlib_main_t *vm = vlib_get_main (); - u32 n_bytes = 0; + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); @@ -1001,15 +1002,16 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (max_bytes != 0); max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; /* Start is beyond snd_congestion */ - if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + if (seq_geq (start, tc->snd_congestion)) goto done; /* Don't overshoot snd_congestion */ - if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - tc->snd_nxt; + 
max_bytes = tc->snd_congestion - start; if (max_bytes == 0) goto done; } @@ -1021,15 +1023,12 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state, 0); - /* Don't count multiple retransmits of the same segment */ - if (tc->rto_boff > 1) - goto done; - - tc->rtx_bytes += n_bytes; + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -1042,18 +1041,15 @@ done: static void tcp_rtx_timeout_cc (tcp_connection_t * tc) { + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - { - tcp_cc_recover (tc); - } - else - { - tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); - } + tcp_cc_fastrecovery_exit (tc); /* Start again from the beginning */ - + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; tcp_recovery_on (tc); @@ -1081,18 +1077,31 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + /* First retransmit timeout */ if (tc->rto_boff == 
1) tcp_rtx_timeout_cc (tc); @@ -1102,24 +1111,30 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - /* Send one segment. No fancy recovery for now! */ + /* Send one segment */ n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); if (n_bytes == 0) { clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); + /* Try again eventually */ tcp_retransmit_timer_set (tc); + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } + + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); - /* Try without increasing RTO a number of times. 
If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) @@ -1132,6 +1147,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; + } if (!is_syn) { @@ -1180,7 +1201,8 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, old_snd_nxt; + int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1202,13 +1224,15 @@ tcp_timer_persist_handler (u32 index) /* Try to force the first unsent segment */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, tc->snd_mss); /* Nothing to send */ - if (n_bytes == 0) + if (n_bytes <= 0) { clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); @@ -1216,7 +1240,13 @@ tcp_timer_persist_handler (u32 index) } b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable persist timer */ @@ -1232,8 +1262,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ @@ -1244,75 +1275,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = 
tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - goto done; + { + tcp_return_buffer (tm); + goto done; + } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); done: - tc->snd_nxt = tc->snd_una_max; + tc->snd_nxt = old_snd_nxt; } -sack_scoreboard_hole_t * -scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) { - sack_scoreboard_hole_t *hole = 0; - -// hole = scoreboard_first_hole (&tc->sack_sb); -// if (hole) -// { -// -// offset = hole->start - tc->snd_una; -// hole_size = hole->end - hole->start; -// -// ASSERT(hole_size); -// -// if (hole_size < max_bytes) -// max_bytes = hole_size; -// } - return hole; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; + vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. 
+ */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; } /** - * Do fast retransmit. + * Fast retransmit without SACK info */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 bi; + u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; - u32 n_written = 0, offset = 0; vlib_buffer_t *b; - u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* If we have SACKs use them */ - if (tcp_opts_sack_permitted (&tc->opt) - && scoreboard_first_hole (&tc->sack_sb)) - use_sacks = 0; while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (use_sacks) - { - scoreboard_first_rtx_hole (&tc->sack_sb); - } - else - { - offset += n_written; - } - + offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ @@ -1326,9 +1399,21 @@ tcp_fast_retransmit 
(tcp_connection_t * tc) snd_space -= n_written; } - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1544,6 +1629,12 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 2af38484..3f8afa40 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -54,7 +54,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_una = 0; tc->snd_una_max = 1000; tc->snd_nxt = 1000; - tc->opt.flags |= TCP_OPTS_FLAG_SACK; + tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; scoreboard_init (&tc->sack_sb); for (i = 0; i < 1000 / 100; i++) @@ -70,9 +70,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -93,18 +93,17 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->last_sacked_bytes == 400), "last sacked bytes %d", sb->last_sacked_bytes); - TCP_TEST ((sb->max_byte_sacked == 900), - "max byte sacked %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 
900), "max byte sacked %u", sb->high_sacked); /* * Inject odd blocks */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -118,8 +117,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->last_sacked_bytes == 500), "last sacked bytes %d", sb->last_sacked_bytes); @@ -135,8 +133,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); TCP_TEST ((sb->snd_una_adv == 900), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); @@ -145,11 +142,11 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Add new block */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); block.start = 1200; block.end = 1300; - vec_add1 (tc->opt.sacks, block); + vec_add1 (tc->rcv_opts.sacks, block); if (verbose) vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); @@ -171,8 +168,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start 
%u end %u", hole->start, hole->end); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1300), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked); hole = scoreboard_last_hole (sb); TCP_TEST ((hole->start == 1300 && hole->end == 1500), "last hole start %u end %u", hole->start, hole->end); @@ -182,7 +178,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Ack first hole */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 1200); if (verbose) @@ -196,8 +192,16 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); /* - * Remove all + * Add some more blocks and then remove all */ + vec_reset_length (tc->rcv_opts.sacks); + for (i = 0; i < 5; i++) + { + block.start = i * 100 + 1200; + block.end = (i + 1) * 100 + 1200; + vec_add1 (tc->rcv_opts.sacks, block); + } + tcp_rcv_sacks (tc, 1900); scoreboard_clear (sb); if (verbose) @@ -205,6 +209,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((pool_elts (sb->holes) == 0), "number of holes %d", pool_elts (sb->holes)); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); + /* * Re-inject odd blocks and ack them all */ @@ -214,9 +221,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_nxt = 1000; for (i = 0; i < 5; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", @@ -740,6 +747,10 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) TCP_TEST (0, 
"[%d] peeked %u expected %u", j, data_buf[j], data[j]); } + /* Try to peek beyond the data */ + rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf); + TCP_TEST ((rv == 0), "peeked %u expected 0", rv); + vec_free (data_buf); svm_fifo_free (f); vec_free (test_data); @@ -1239,7 +1250,7 @@ tcp_test_session (vlib_main_t * vm, unformat_input_t * input) tc0->c_thread_index = 0; tc0->c_lcl_ip4.as_u32 = local.as_u32; tc0->c_rmt_ip4.as_u32 = remote.as_u32; - tc0->opt.mss = 1450; + tc0->rcv_opts.mss = 1450; tcp_connection_init_vars (tc0); TCP_EVT_DBG (TCP_EVT_OPEN, tc0); -- cgit 1.2.3-korg From f03a59ab008908f98fd7d1b187a8c0fb78b01add Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 9 Jun 2017 21:07:32 -0700 Subject: Overall tcp performance improvements (VPP-846) - limit minimum rto per connection - cleanup sack scoreboard - switched svm fifo out-of-order data handling from absolute offsets to relative offsets. - improve cwnd handling when using sacks - add cc event debug stats - improved uri tcp test client/server: bugfixes and added half-duplex mode - expanded builtin client/server - updated uri socket client/server code to work in half-duplex - ensure session node unsets fifo event for empty fifo - fix session detach Change-Id: Ia446972340e32a65e0694ee2844355167d0c170d Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 152 +++++++++-------- src/svm/svm_fifo.h | 26 ++- src/svm/svm_fifo_segment.c | 7 +- src/uri/uri_socket_server.c | 25 ++- src/uri/uri_socket_test.c | 93 +++++++---- src/uri/uri_tcp_test.c | 120 ++++++++++---- src/vnet/session/application.c | 2 +- src/vnet/session/node.c | 7 +- src/vnet/session/segment_manager.c | 5 + src/vnet/session/session.c | 25 ++- src/vnet/session/session.h | 4 +- src/vnet/session/session_api.c | 2 +- src/vnet/session/transport.h | 2 + src/vnet/tcp/builtin_client.c | 325 +++++++++++++++++++------------------ src/vnet/tcp/builtin_client.h | 107 +++++------- src/vnet/tcp/builtin_server.c | 114 ++++++++++--- 
src/vnet/tcp/tcp.c | 22 ++- src/vnet/tcp/tcp.h | 12 +- src/vnet/tcp/tcp_debug.h | 186 +++++++++++++++------ src/vnet/tcp/tcp_input.c | 193 ++++++++++++---------- src/vnet/tcp/tcp_newreno.c | 4 +- src/vnet/tcp/tcp_output.c | 10 +- src/vnet/tcp/tcp_test.c | 70 ++++++-- 23 files changed, 945 insertions(+), 568 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 5c8f244a..6ca437cf 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -15,10 +15,39 @@ #include -#define offset_lt(_a, _b) ((i32)((_a)-(_b)) < 0) -#define offset_leq(_a, _b) ((i32)((_a)-(_b)) <= 0) -#define offset_gt(_a, _b) ((i32)((_a)-(_b)) > 0) -#define offset_geq(_a, _b) ((i32)((_a)-(_b)) >= 0) +static inline u8 +position_lt (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + < ooo_segment_distance_to_tail (f, b)); +} + +static inline u8 +position_leq (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + <= ooo_segment_distance_to_tail (f, b)); +} + +static inline u8 +position_gt (svm_fifo_t * f, u32 a, u32 b) +{ + return (ooo_segment_distance_to_tail (f, a) + > ooo_segment_distance_to_tail (f, b)); +} + +static inline u32 +position_diff (svm_fifo_t * f, u32 posa, u32 posb) +{ + return ooo_segment_distance_to_tail (f, posa) + - ooo_segment_distance_to_tail (f, posb); +} + +static inline u32 +ooo_segment_end_pos (svm_fifo_t * f, ooo_segment_t * s) +{ + return (s->start + s->length) % f->nitems; +} u8 * format_ooo_segment (u8 * s, va_list * args) @@ -145,13 +174,17 @@ static void ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { ooo_segment_t *s, *new_s, *prev, *next, *it; - u32 new_index, end_offset, s_sof, s_eof, s_index; + u32 new_index, s_end_pos, s_index; + u32 normalized_position, normalized_end_position; + + normalized_position = (f->tail + offset) % f->nitems; + normalized_end_position = (f->tail + offset + length) % f->nitems; - end_offset = offset + length; + 
f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; if (f->ooos_list_head == OOO_SEGMENT_INVALID_INDEX) { - s = ooo_segment_new (f, offset, length); + s = ooo_segment_new (f, normalized_position, length); f->ooos_list_head = s - f->ooo_segments; f->ooos_newest = f->ooos_list_head; return; @@ -160,28 +193,26 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) /* Find first segment that starts after new segment */ s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); while (s->next != OOO_SEGMENT_INVALID_INDEX - && offset_leq (ooo_segment_offset (f, s), offset)) + && position_lt (f, s->start, normalized_position)) s = pool_elt_at_index (f->ooo_segments, s->next); /* If we have a previous and we overlap it, use it as starting point */ prev = ooo_segment_get_prev (f, s); - if (prev && offset_leq (offset, ooo_segment_end_offset (f, prev))) + if (prev + && position_leq (f, normalized_position, ooo_segment_end_pos (f, prev))) { s = prev; - prev = ooo_segment_get_prev (f, s); - s_sof = ooo_segment_offset (f, s); - s_eof = ooo_segment_end_offset (f, s); + s_end_pos = ooo_segment_end_pos (f, s); goto merge; } s_index = s - f->ooo_segments; - s_sof = ooo_segment_offset (f, s); - s_eof = ooo_segment_end_offset (f, s); + s_end_pos = ooo_segment_end_pos (f, s); /* No overlap, add before current segment */ - if (offset_lt (end_offset, s_sof)) + if (position_lt (f, normalized_end_position, s->start)) { - new_s = ooo_segment_new (f, offset, length); + new_s = ooo_segment_new (f, normalized_position, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ @@ -198,28 +229,23 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) f->ooos_list_head = new_index; } - new_s->next = s - f->ooo_segments; + new_s->next = s_index; s->prev = new_index; f->ooos_newest = new_index; return; } /* No overlap, add after current segment */ - else if (offset_gt (offset, s_eof)) + else if (position_gt (f, normalized_position, s_end_pos)) { - new_s = 
ooo_segment_new (f, offset, length); + new_s = ooo_segment_new (f, normalized_position, length); new_index = new_s - f->ooo_segments; /* Pool might've moved, get segment again */ s = pool_elt_at_index (f->ooo_segments, s_index); - if (s->next != OOO_SEGMENT_INVALID_INDEX) - { - new_s->next = s->next; - next = pool_elt_at_index (f->ooo_segments, new_s->next); - next->prev = new_index; - } + ASSERT (s->next == OOO_SEGMENT_INVALID_INDEX); - new_s->prev = s - f->ooo_segments; + new_s->prev = s_index; s->next = new_index; f->ooos_newest = new_index; @@ -233,30 +259,32 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) merge: /* Merge at head */ - if (offset_lt (offset, s_sof)) + if (position_lt (f, normalized_position, s->start)) { - s->start = offset; - s->length = s_eof - ooo_segment_offset (f, s); + s->start = normalized_position; + s->length = position_diff (f, s_end_pos, s->start); } - /* Last but overlapping previous */ - else if (offset_gt (end_offset, s_eof)) + /* Overlapping tail */ + else if (position_gt (f, normalized_end_position, s_end_pos)) { - s->length = end_offset - ooo_segment_offset (f, s); + s->length = position_diff (f, normalized_end_position, s->start); } /* New segment completely covered by current one */ else { /* Do Nothing */ + s = 0; goto done; } /* The new segment's tail may cover multiple smaller ones */ - if (offset_geq (end_offset, s_eof)) + if (position_gt (f, normalized_end_position, s_end_pos)) { /* Remove the completely overlapped segments */ it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? pool_elt_at_index (f->ooo_segments, s->next) : 0; - while (it && offset_leq (ooo_segment_end_offset (f, it), end_offset)) + while (it && position_leq (f, ooo_segment_end_pos (f, it), + normalized_end_position)) { next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? 
pool_elt_at_index (f->ooo_segments, it->next) : 0; @@ -265,17 +293,17 @@ merge: } /* If partial overlap with last, merge */ - if (it && offset_leq (ooo_segment_offset (f, it), end_offset)) + if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = ooo_segment_end_offset (f, it) - - ooo_segment_offset (f, s); + s->length = ooo_segment_end_pos (f, it) - s->start; ooo_segment_del (f, it - f->ooo_segments); } } done: /* Most recently updated segment */ - f->ooos_newest = s - f->ooo_segments; + if (s) + f->ooos_newest = s - f->ooo_segments; } /** @@ -286,32 +314,28 @@ static int ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { ooo_segment_t *s; - u32 index, bytes = 0, diff; - u32 cursize, norm_start, nitems; - - /* current size has not yet been updated */ - cursize = svm_fifo_max_dequeue (f) + n_bytes_enqueued; - nitems = f->nitems; + u32 index, bytes = 0; + i32 diff; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); - norm_start = s->start % nitems; - diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; + diff = (f->tail >= s->start) ? + f->tail - s->start : f->nitems + f->tail - s->start; - if (diff > cursize) + if (diff > n_bytes_enqueued) return 0; /* If last tail update overlaps one/multiple ooo segments, remove them */ - while (0 < diff && diff < cursize) + while (0 <= diff && diff < n_bytes_enqueued) { index = s - f->ooo_segments; /* Segment end is beyond the tail. Advance tail and remove segment */ - if (diff < s->length) + if (s->length > diff) { - f->tail += s->length - diff; - f->tail %= f->nitems; bytes = s->length - diff; + f->tail += bytes; + f->tail %= f->nitems; ooo_segment_del (f, index); break; } @@ -320,8 +344,8 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - norm_start = s->start % nitems; - diff = (f->nitems + (i32) (f->tail - norm_start)) % nitems; + diff = (f->tail >= s->start) ? 
+ f->tail - s->start : f->nitems + f->tail - s->start; ooo_segment_del (f, index); } /* End of search */ @@ -332,18 +356,6 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } - /* If tail is adjacent to an ooo segment, 'consume' it */ - if (diff == 0) - { - bytes = ((nitems - cursize) >= s->length) ? s->length : - nitems - cursize; - - f->tail += bytes; - f->tail %= nitems; - - ooo_segment_del (f, s - f->ooo_segments); - } - return bytes; } @@ -355,6 +367,7 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; if (PREDICT_FALSE (cursize == f->nitems)) return -2; /* fifo stuffed */ @@ -424,13 +437,16 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, u8 * copy_from_here) { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; - u32 cursize, nitems; - u32 normalized_offset, offset_from_tail; + u32 cursize, nitems, normalized_offset; + u32 offset_from_tail; + + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; - normalized_offset = offset % nitems; + + normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? */ offset_from_tail = (nitems + normalized_offset - f->tail) % nitems; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 9cb93ff4..f32ef41d 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -127,21 +127,37 @@ format_function_t format_svm_fifo; always_inline ooo_segment_t * svm_fifo_newest_ooo_segment (svm_fifo_t * f) { - return f->ooo_segments + f->ooos_newest; + if (f->ooos_newest == OOO_SEGMENT_INVALID_INDEX) + return 0; + return pool_elt_at_index (f->ooo_segments, f->ooos_newest); +} + +always_inline u32 +ooo_segment_distance_to_tail (svm_fifo_t * f, u32 a) +{ + /* Ambiguous. 
Assumption is that ooo segments don't touch tail */ + if (a == f->tail && f->tail == f->head) + return f->nitems; + + return ((f->nitems + a - f->tail) % f->nitems); } always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { -// return ((f->nitems + s->fifo_position - f->tail) % f->nitems); - return s->start; + return ooo_segment_distance_to_tail (f, s->start); } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { -// return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); - return s->start + s->length; + return ooo_segment_distance_to_tail (f, s->start) + s->length; +} + +always_inline u32 +ooo_segment_length (svm_fifo_t * f, ooo_segment_t * s) +{ + return s->length; } always_inline ooo_segment_t * diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index eef2168c..c4ac2352 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -305,14 +305,17 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, /* Remove from active list */ if (f->prev) f->prev->next = f->next; + else + fsh->fifos = f->next; if (f->next) f->next->prev = f->prev; - /* FALLTHROUGH */ + /* Fall through: we add only rx fifos to active pool */ case FIFO_SEGMENT_TX_FREELIST: /* Add to free list */ f->next = fsh->free_fifos[list_index]; + f->prev = 0; fsh->free_fifos[list_index] = f; - /* FALLTHROUGH */ + break; case FIFO_SEGMENT_FREELIST_NONE: break; diff --git a/src/uri/uri_socket_server.c b/src/uri/uri_socket_server.c index 2366f420..4f4c5f30 100644 --- a/src/uri/uri_socket_server.c +++ b/src/uri/uri_socket_server.c @@ -22,6 +22,7 @@ #include #include #include +#include volatile int signal_received; @@ -78,7 +79,10 @@ main (int argc, char *argv[]) struct sockaddr_in serv_addr; struct sockaddr_in client; struct hostent *server; - u8 *rx_buffer = 0; + u8 *rx_buffer = 0, no_echo = 0; + struct timeval start, end; + long rcvd = 0; + double deltat; if (argc > 1 && argc < 3) { @@ 
-86,8 +90,9 @@ main (int argc, char *argv[]) exit (0); } - if (argc >= 3) + if (argc >= 4) { + no_echo = atoi (argv[3]); portno = atoi (argv[2]); server = gethostbyname (argv[1]); if (server == NULL) @@ -137,7 +142,7 @@ main (int argc, char *argv[]) exit (1); } - vec_validate (rx_buffer, 8999 /* jumbo mtu */ ); + vec_validate (rx_buffer, 128 << 10); if (listen (sockfd, 5 /* backlog */ ) < 0) { @@ -160,6 +165,8 @@ main (int argc, char *argv[]) } fformat (stderr, "Accepted connection from: %s : %d\n", inet_ntoa (client.sin_addr), client.sin_port); + gettimeofday (&start, NULL); + while (1) { n = recv (accfd, rx_buffer, vec_len (rx_buffer), 0 /* flags */ ); @@ -167,6 +174,14 @@ main (int argc, char *argv[]) { /* Graceful exit */ close (accfd); + gettimeofday (&end, NULL); + deltat = (end.tv_sec - start.tv_sec); + deltat += (end.tv_usec - start.tv_usec) / 1000000.0; + clib_warning ("Finished in %.6f", deltat); + clib_warning ("%.4f Gbit/second %s", + (((f64) rcvd * 8.0) / deltat / 1e9), + no_echo ? 
"half" : "full"); + rcvd = 0; break; } if (n < 0) @@ -179,6 +194,10 @@ main (int argc, char *argv[]) if (signal_received) break; + rcvd += n; + if (no_echo) + continue; + sent = send (accfd, rx_buffer, n, 0 /* flags */ ); if (n < 0) { diff --git a/src/uri/uri_socket_test.c b/src/uri/uri_socket_test.c index 9f049bda..5f7084d5 100644 --- a/src/uri/uri_socket_test.c +++ b/src/uri/uri_socket_test.c @@ -19,6 +19,7 @@ #include #include #include +#include int main (int argc, char *argv[]) @@ -26,28 +27,44 @@ main (int argc, char *argv[]) int sockfd, portno, n; struct sockaddr_in serv_addr; struct hostent *server; - u8 *rx_buffer = 0, *tx_buffer = 0; + u8 *rx_buffer = 0, *tx_buffer = 0, no_echo = 0, test_bytes = 0; u32 offset; - int iter, i; - if (0 && argc < 3) + long bytes = 1 << 20, to_send; + int i; + struct timeval start, end; + double deltat; + + if (argc >= 3) { - fformat (stderr, "usage %s hostname port\n", argv[0]); - exit (0); + bytes = ((long) atoi (argv[4])) << 20; + no_echo = atoi (argv[3]); + portno = atoi (argv[2]); + server = gethostbyname (argv[1]); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } + } + else + { + portno = 1234; // atoi(argv[2]); + server = gethostbyname ("6.0.1.1" /* argv[1] */ ); + if (server == NULL) + { + clib_unix_warning ("gethostbyname"); + exit (1); + } } - portno = 1234; // atoi(argv[2]); + to_send = bytes; sockfd = socket (AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { clib_unix_error ("socket"); exit (1); } - server = gethostbyname ("6.0.1.1" /* argv[1] */ ); - if (server == NULL) - { - clib_unix_warning ("gethostbyname"); - exit (1); - } + bzero ((char *) &serv_addr, sizeof (serv_addr)); serv_addr.sin_family = AF_INET; bcopy ((char *) server->h_addr, @@ -59,8 +76,8 @@ main (int argc, char *argv[]) exit (1); } - vec_validate (rx_buffer, 1400); - vec_validate (tx_buffer, 1400); + vec_validate (rx_buffer, 128 << 10); + vec_validate (tx_buffer, 128 << 10); for (i = 0; i < vec_len (tx_buffer); i++) 
tx_buffer[i] = (i + 1) % 0xff; @@ -75,19 +92,28 @@ main (int argc, char *argv[]) exit (0); } - for (iter = 0; iter < 100000; iter++) + gettimeofday (&start, NULL); + while (bytes > 0) { - if (iter < 99999) + /* + * TX + */ + n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); + if (n != vec_len (tx_buffer)) { - n = send (sockfd, tx_buffer, vec_len (tx_buffer), 0 /* flags */ ); - if (n != vec_len (tx_buffer)) - { - clib_unix_warning ("write"); - exit (0); - } + clib_unix_warning ("write"); + exit (0); } - offset = 0; + bytes -= n; + if (no_echo) + continue; + + /* + * RX + */ + + offset = 0; do { n = recv (sockfd, rx_buffer + offset, @@ -101,18 +127,27 @@ main (int argc, char *argv[]) } while (offset < vec_len (rx_buffer)); - for (i = 0; i < vec_len (rx_buffer); i++) + if (test_bytes) { - if (rx_buffer[i] != tx_buffer[i]) + for (i = 0; i < vec_len (rx_buffer); i++) { - clib_warning ("[%d] read 0x%x not 0x%x", - rx_buffer[i], tx_buffer[i]); - exit (1); + if (rx_buffer[i] != tx_buffer[i]) + { + clib_warning ("[%d] read 0x%x not 0x%x", rx_buffer[i], + tx_buffer[i]); + exit (1); + } } } - } close (sockfd); + gettimeofday (&end, NULL); + + deltat = (end.tv_sec - start.tv_sec); + deltat += (end.tv_usec - start.tv_usec) / 1000000.0; // us to ms + clib_warning ("Finished in %.6f", deltat); + clib_warning ("%.4f Gbit/second %s", (((f64) to_send * 8.0) / deltat / 1e9), + no_echo ? 
"half" : "full"); return 0; } diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index e201a359..d1694cf4 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -46,6 +46,8 @@ typedef struct svm_fifo_t *server_tx_fifo; u64 vpp_session_handle; + u64 bytes_received; + f64 start; } session_t; typedef enum @@ -174,7 +176,7 @@ wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) if (utm->state == STATE_FAILED) return -1; if (utm->time_to_stop == 1) - return -1; + return 0; } clib_warning ("timeout waiting for STATE_READY"); return -1; @@ -184,7 +186,7 @@ void application_send_attach (uri_tcp_test_main_t * utm) { vl_api_application_attach_t *bmp; - u32 fifo_size = 3 << 20; + u32 fifo_size = 4 << 20; bmp = vl_msg_api_alloc (sizeof (*bmp)); memset (bmp, 0, sizeof (*bmp)); @@ -343,11 +345,23 @@ vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) mp->segment_size); } +static void +session_print_stats (uri_tcp_test_main_t * utm, session_t * session) +{ + f64 deltat; + u64 bytes; + + deltat = clib_time_now (&utm->clib_time) - session->start; + bytes = utm->i_am_master ? 
session->bytes_received : utm->bytes_to_send; + fformat (stdout, "Finished in %.6f\n", deltat); + fformat (stdout, "%.4f Gbit/second\n", (bytes * 8.0) / deltat / 1e9); +} + static void vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; - session_t *session; + session_t *session = 0; vl_api_disconnect_session_reply_t *rmp; uword *p; int rv = 0; @@ -366,7 +380,7 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rv = -11; } - utm->time_to_stop = 1; +// utm->time_to_stop = 1; rmp = vl_msg_api_alloc (sizeof (*rmp)); memset (rmp, 0, sizeof (*rmp)); @@ -375,6 +389,9 @@ vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) rmp->retval = rv; rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); + + if (session) + session_print_stats (utm, session); } static void @@ -431,14 +448,19 @@ client_handle_fifo_event_rx (uri_tcp_test_main_t * utm, if (n_read > 0) { bytes -= n_read; - for (i = 0; i < n_read; i++) + if (utm->test_return_packets) { - if (utm->rx_buf[i] != ((utm->client_bytes_received + i) & 0xff)) + for (i = 0; i < n_read; i++) { - clib_warning ("error at byte %lld, 0x%x not 0x%x", - utm->client_bytes_received + i, - utm->rx_buf[i], - ((utm->client_bytes_received + i) & 0xff)); + if (utm->rx_buf[i] + != ((utm->client_bytes_received + i) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + utm->client_bytes_received + i, + utm->rx_buf[i], + ((utm->client_bytes_received + + i) & 0xff)); + } } } utm->client_bytes_received += n_read; @@ -545,6 +567,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) session->server_rx_fifo = rx_fifo; session->server_tx_fifo = tx_fifo; session->vpp_session_handle = mp->handle; + session->start = clib_time_now (&utm->clib_time); /* Save handle */ utm->connected_session_index = session_index; @@ -571,7 +594,7 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * 
tx_fifo, int mypid, u64 bytes_sent = 0; int test_buf_offset = 0; u32 bytes_to_snd; - u32 queue_max_chunk = 64 << 10, actual_write; + u32 queue_max_chunk = 128 << 10, actual_write; session_fifo_event_t evt; static int serial_number = 0; int rv; @@ -582,8 +605,8 @@ send_test_chunk (uri_tcp_test_main_t * utm, svm_fifo_t * tx_fifo, int mypid, while (bytes_to_snd > 0) { - actual_write = - bytes_to_snd > queue_max_chunk ? queue_max_chunk : bytes_to_snd; + actual_write = (bytes_to_snd > queue_max_chunk) ? + queue_max_chunk : bytes_to_snd; rv = svm_fifo_enqueue_nowait (tx_fifo, actual_write, test_data + test_buf_offset); @@ -635,9 +658,9 @@ client_send_data (uri_tcp_test_main_t * utm) if (leftover) send_test_chunk (utm, tx_fifo, mypid, leftover); - if (utm->test_return_packets) + if (!utm->drop_packets) { - f64 timeout = clib_time_now (&utm->clib_time) + 2; + f64 timeout = clib_time_now (&utm->clib_time) + 10; /* Wait for the outstanding packets */ while (utm->client_bytes_received < @@ -698,6 +721,7 @@ int client_disconnect (uri_tcp_test_main_t * utm) { client_send_disconnect (utm); + clib_warning ("Sent disconnect"); if (wait_for_state_change (utm, STATE_START)) { clib_warning ("Disconnect failed"); @@ -721,7 +745,7 @@ client_test (uri_tcp_test_main_t * utm) } /* Init test data */ - vec_validate (utm->connect_test_data, 64 * 1024 - 1); + vec_validate (utm->connect_test_data, 128 * 1024 - 1); for (i = 0; i < vec_len (utm->connect_test_data); i++) utm->connect_test_data[i] = i & 0xff; @@ -899,6 +923,9 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); rmp->handle = mp->handle; vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); + + session->bytes_received = 0; + session->start = clib_time_now (&utm->clib_time); } void @@ -909,37 +936,50 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, int n_read; session_fifo_event_t evt; unix_shared_memory_queue_t *q; - int rv, bytes; + session_t 
*session; + int rv; + u32 max_dequeue, offset, max_transfer, rx_buf_len; + rx_buf_len = vec_len (utm->rx_buf); rx_fifo = e->fifo; - tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + session = &utm->sessions[rx_fifo->client_session_index]; + tx_fifo = session->server_tx_fifo; - bytes = svm_fifo_max_dequeue (rx_fifo); + max_dequeue = svm_fifo_max_dequeue (rx_fifo); /* Allow enqueuing of a new event */ svm_fifo_unset_event (rx_fifo); - if (bytes == 0) - return; + if (PREDICT_FALSE (max_dequeue == 0)) + { + return; + } - /* Read the bytes */ + /* Read the max_dequeue */ do { - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (utm->rx_buf), - utm->rx_buf); + max_transfer = clib_min (rx_buf_len, max_dequeue); + n_read = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, utm->rx_buf); if (n_read > 0) - bytes -= n_read; - - if (utm->drop_packets) - continue; + { + max_dequeue -= n_read; + session->bytes_received += n_read; + } /* Reflect if a non-drop session */ - if (n_read > 0) + if (!utm->drop_packets && n_read > 0) { + offset = 0; do { - rv = svm_fifo_enqueue_nowait (tx_fifo, n_read, utm->rx_buf); + rv = svm_fifo_enqueue_nowait (tx_fifo, n_read, + &utm->rx_buf[offset]); + if (rv > 0) + { + n_read -= rv; + offset += rv; + } } - while (rv <= 0 && !utm->time_to_stop); + while ((rv <= 0 || n_read > 0) && !utm->time_to_stop); /* If event wasn't set, add one */ if (svm_fifo_set_event (tx_fifo)) @@ -951,11 +991,11 @@ server_handle_fifo_event_rx (uri_tcp_test_main_t * utm, q = utm->vpp_event_queue; unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); + 1 /* do wait for mutex */ ); } } } - while ((n_read < 0 || bytes > 0) && !utm->time_to_stop); + while ((n_read < 0 || max_dequeue > 0) && !utm->time_to_stop); } void @@ -1068,9 +1108,18 @@ vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * mp) { uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t *session; + + if (mp->retval) + { + clib_warning 
("vpp complained about disconnect: %d", + ntohl (mp->retval)); + } - clib_warning ("retval %d", ntohl (mp->retval)); utm->state = STATE_START; + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + if (session) + session_print_stats (utm, session); } #define foreach_uri_msg \ @@ -1123,7 +1172,7 @@ main (int argc, char **argv) /* make the main heap thread-safe */ h->flags |= MHEAP_FLAG_THREAD_SAFE; - vec_validate (utm->rx_buf, 65536); + vec_validate (utm->rx_buf, 128 << 10); utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); @@ -1186,6 +1235,7 @@ main (int argc, char **argv) utm->drop_packets = drop_packets; utm->test_return_packets = test_return_packets; utm->bytes_to_send = bytes_to_send; + utm->time_to_stop = 0; setup_signal_handlers (); uri_api_hookup (utm); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index c679b1f5..4bdb1027 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -117,7 +117,7 @@ application_del (application_t * app) /* Actual listener cleanup */ for (i = 0; i < vec_len (handles); i++) { - a->app_index = app->api_client_index; + a->app_index = app->index; a->handle = handles[i]; /* seg manager is removed when unbind completes */ vnet_unbind (a); diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 07eeae82..c0ab1bf0 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -171,7 +171,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Nothing to read return */ if (max_dequeue0 == 0) - return 0; + { + svm_fifo_unset_event (s0->server_tx_fifo); + return 0; + } /* Ensure we're not writing more than transport window allows */ if (max_dequeue0 < snd_space0) @@ -393,7 +396,7 @@ session_event_get_session (session_fifo_event_t * e0, u8 thread_index) s0 = stream_session_get_if_valid (session_index0, thread_index); - ASSERT (s0->thread_index == thread_index); + ASSERT (s0 == 0 || 
s0->thread_index == thread_index); return s0; } diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index caf8eaa3..bf571963 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -306,11 +306,13 @@ again: if (added_a_segment) { clib_warning ("added a segment, still cant allocate a fifo"); + clib_spinlock_unlock (&sm->lockp); return SESSION_ERROR_NEW_SEG_NO_SPACE; } if (session_manager_add_segment (sm)) { + clib_spinlock_unlock (&sm->lockp); return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; } @@ -320,6 +322,7 @@ again: else { clib_warning ("No space to allocate fifos!"); + clib_spinlock_unlock (&sm->lockp); return SESSION_ERROR_NO_SPACE; } } @@ -361,8 +364,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, if (sm->segment_indices[0] != svm_segment_index && !svm_fifo_segment_has_fifos (fifo_segment)) { + clib_spinlock_lock (&sm->lockp); svm_fifo_segment_delete (fifo_segment); vec_del1 (sm->segment_indices, svm_segment_index); + clib_spinlock_unlock (&sm->lockp); } } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 534598d6..fe198044 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -700,7 +700,7 @@ stream_session_init_fifos_pointers (transport_connection_t * tc, svm_fifo_init_pointers (s->server_tx_fifo, tx_pointer); } -void +int stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail) { @@ -709,6 +709,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, stream_session_t *new_s = 0; u64 handle; u32 api_context = 0; + int error = 0; handle = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, tc->lcl_port, tc->rmt_port, @@ -716,7 +717,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); - return; + return -1; } /* Get the app's index from the handle we stored 
when opening connection */ @@ -730,9 +731,12 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, /* Create new session (svm segments are allocated if needed) */ if (stream_session_create_i (sm, tc, &new_s)) - return; - - new_s->app_index = app->index; + { + is_fail = 1; + error = -1; + } + else + new_s->app_index = app->index; } /* Notify client */ @@ -741,6 +745,8 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, /* Cleanup session lookup */ stream_session_half_open_table_del (smm, sst, tc); + + return error; } void @@ -981,8 +987,13 @@ session_send_session_evt_to_thread (u64 session_handle, /* Based on request block (or not) for lack of space */ if (PREDICT_TRUE (q->cursize < q->maxsize)) - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ ); + { + if (unix_shared_memory_queue_add (q, (u8 *) & evt, + 1 /* do wait for mutex */ )) + { + clib_warning ("failed to enqueue evt"); + } + } else { clib_warning ("queue full"); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index d9c38bd1..5fa4225c 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -368,8 +368,8 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); -void stream_session_connect_notify (transport_connection_t * tc, u8 sst, - u8 is_fail); +int stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail); void stream_session_init_fifos_pointers (transport_connection_t * tc, u32 rx_pointer, u32 tx_pointer); diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index f772cb9f..60f764af 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -419,7 +419,7 @@ done: REPLY_MACRO (VL_API_UNBIND_URI_REPLY); } -void +static void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) { vl_api_connect_uri_reply_t 
*rmp; diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index e5f788be..04bd5ca0 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -39,6 +39,7 @@ typedef struct _transport_connection #if TRANSPORT_DEBUG elog_track_t elog_track; /**< Event logging */ + u32 cc_stat_tstamp; /**< CC stats timestamp */ #endif /** Macros for 'derived classes' where base is named "connection" */ @@ -57,6 +58,7 @@ typedef struct _transport_connection #define c_is_ip4 connection.is_ip4 #define c_thread_index connection.thread_index #define c_elog_track connection.elog_track +#define c_cc_stat_tstamp connection.cc_stat_tstamp } transport_connection_t; /* diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 7238cda3..6f8be082 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -43,7 +43,7 @@ #include #undef vl_printfun -#define TCP_BUILTIN_CLIENT_DBG (1) +#define TCP_BUILTIN_CLIENT_DBG (0) static void send_test_chunk (tclient_main_t * tm, session_t * s) @@ -92,7 +92,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s) ed->data[2] = s->bytes_to_send; } - /* Poke the TCP state machine */ + /* Poke the session layer */ if (svm_fifo_set_event (s->server_tx_fifo)) { /* Fabricate TX event, send to vpp */ @@ -100,8 +100,9 @@ send_test_chunk (tclient_main_t * tm, session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("could not enqueue event"); } } } @@ -188,13 +189,13 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, sp = pool_elt_at_index (tm->sessions, connection_indices[i]); - if (tx_quota < 60 && sp->bytes_to_send > 0) + if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) { send_test_chunk (tm, sp); 
delete_session = 0; tx_quota++; } - if (sp->bytes_to_receive > 0) + if (!tm->no_return && sp->bytes_to_receive > 0) { prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); @@ -205,13 +206,14 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } if (PREDICT_FALSE (delete_session == 1)) { + __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); + dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); memset (dmp, 0, sizeof (*dmp)); dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; dmp->handle = sp->vpp_session_handle; -// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, 1)) { @@ -247,7 +249,6 @@ VLIB_REGISTER_NODE (builtin_client_node) = }; /* *INDENT-ON* */ - /* So we don't get "no handler for... " msgs */ static void vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) @@ -255,76 +256,10 @@ vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) vlib_main_t *vm = vlib_get_main (); tclient_main_t *tm = &tclient_main; tm->my_client_index = mp->index; - vlib_process_signal_event (vm, tm->node_index, 1 /* evt */ , + vlib_process_signal_event (vm, tm->cli_node_index, 1 /* evt */ , 0 /* data */ ); } -static void -vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) -{ - tclient_main_t *tm = &tclient_main; - session_t *session; - u32 session_index; - i32 retval = /* clib_net_to_host_u32 ( */ mp->retval /*) */ ; - int i; - - if (retval < 0) - { - clib_warning ("connection failed: retval %d", retval); - return; - } - - tm->our_event_queue = - uword_to_pointer (mp->vpp_event_queue_address, - unix_shared_memory_queue_t *); - tm->vpp_event_queue = - uword_to_pointer (mp->vpp_event_queue_address, - unix_shared_memory_queue_t *); - - /* - * Setup session - */ - pool_get (tm->sessions, session); - 
memset (session, 0, sizeof (*session)); - session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; - - session->server_rx_fifo = - uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); - session->server_rx_fifo->client_session_index = session_index; - session->server_tx_fifo = - uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); - session->server_tx_fifo->client_session_index = session_index; - session->vpp_session_handle = mp->handle; - - /* Add it to the session lookup table */ - hash_set (tm->session_index_by_vpp_handles, mp->handle, session_index); - - if (tm->ready_connections == tm->expected_connections - 1) - { - vlib_thread_main_t *thread_main = vlib_get_thread_main (); - int thread_index; - - thread_index = 0; - for (i = 0; i < pool_elts (tm->sessions); i++) - { - vec_add1 (tm->connection_index_by_thread[thread_index], i); - thread_index++; - if (thread_index == thread_main->n_vlib_mains) - thread_index = 0; - } - } - __sync_fetch_and_add (&tm->ready_connections, 1); - if (tm->ready_connections == tm->expected_connections) - { - tm->run_test = 1; - tm->test_start_time = vlib_time_now (tm->vlib_main); - /* Signal the CLI process that the action is starting... 
*/ - vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, - 1, 0 /* data */ ); - } -} - static int create_api_loopback (tclient_main_t * tm) { @@ -347,12 +282,11 @@ create_api_loopback (tclient_main_t * tm) mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; mp->context = 0xFEEDFACE; mp->input_queue = pointer_to_uword (tm->vl_input_queue); - strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); + strncpy ((char *) mp->name, "tcp_clients_tester", sizeof (mp->name) - 1); vl_api_memclnt_create_t_handler (mp); /* Wait for reply */ - tm->node_index = vlib_get_current_process (vm)->node_runtime.node_index; vlib_process_wait_for_event_or_clock (vm, 1.0); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) @@ -373,7 +307,6 @@ create_api_loopback (tclient_main_t * tm) #define foreach_tclient_static_api_msg \ _(MEMCLNT_CREATE_REPLY, memclnt_create_reply) \ -_(CONNECT_URI_REPLY, connect_uri_reply) static clib_error_t * tclient_api_hookup (vlib_main_t * vm) @@ -411,8 +344,8 @@ tcp_test_clients_init (vlib_main_t * vm) if (create_api_loopback (tm)) return -1; - /* Init test data */ - vec_validate (tm->connect_test_data, 64 * 1024 - 1); + /* Init test data. 
Big buffer */ + vec_validate (tm->connect_test_data, 1024 * 1024 - 1); for (i = 0; i < vec_len (tm->connect_test_data); i++) tm->connect_test_data[i] = i & 0xff; @@ -430,37 +363,66 @@ static int builtin_session_connected_callback (u32 app_index, u32 api_context, stream_session_t * s, u8 is_fail) { - vl_api_connect_uri_reply_t _m, *mp = &_m; - unix_shared_memory_queue_t *q; - application_t *app; - unix_shared_memory_queue_t *vpp_queue; + tclient_main_t *tm = &tclient_main; + session_t *session; + u32 session_index; + int i; - app = application_get (app_index); - q = vl_api_client_index_to_input_queue (app->api_client_index); + if (is_fail) + { + clib_warning ("connection %d failed!", api_context); + vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, -1, + 0 /* data */ ); + return -1; + } - if (!q) - return -1; + /* Mark vpp session as connected */ + s->session_state = SESSION_STATE_READY; - memset (mp, 0, sizeof (*mp)); - mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); - mp->context = api_context; - if (!is_fail) + tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index); + tm->vpp_event_queue = session_manager_get_vpp_event_queue (s->thread_index); + + /* + * Setup session + */ + pool_get (tm->sessions, session); + memset (session, 0, sizeof (*session)); + session_index = session - tm->sessions; + session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->server_rx_fifo = s->server_rx_fifo; + session->server_rx_fifo->client_session_index = session_index; + session->server_tx_fifo = s->server_tx_fifo; + session->server_tx_fifo->client_session_index = session_index; + session->vpp_session_handle = stream_session_handle (s); + + /* Add it to the session lookup table */ + hash_set (tm->session_index_by_vpp_handles, session->vpp_session_handle, + session_index); + + if (tm->ready_connections == tm->expected_connections - 1) { - vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); - 
mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo); - mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo); - mp->handle = stream_session_handle (s); - mp->vpp_event_queue_address = pointer_to_uword (vpp_queue); - mp->retval = 0; - s->session_state = SESSION_STATE_READY; + vlib_thread_main_t *thread_main = vlib_get_thread_main (); + int thread_index; + + thread_index = 0; + for (i = 0; i < pool_elts (tm->sessions); i++) + { + vec_add1 (tm->connection_index_by_thread[thread_index], i); + thread_index++; + if (thread_index == thread_main->n_vlib_mains) + thread_index = 0; + } } - else + __sync_fetch_and_add (&tm->ready_connections, 1); + if (tm->ready_connections == tm->expected_connections) { - mp->retval = clib_host_to_net_u32 (VNET_API_ERROR_SESSION_CONNECT_FAIL); + tm->run_test = 1; + tm->test_start_time = vlib_time_now (tm->vlib_main); + /* Signal the CLI process that the action is starting... */ + vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, 1, + 0 /* data */ ); } - vl_api_connect_uri_reply_t_handler (mp); - return 0; } @@ -489,23 +451,22 @@ builtin_server_rx_callback (stream_session_t * s) } /* *INDENT-OFF* */ -static session_cb_vft_t builtin_clients = - { - .session_reset_callback = builtin_session_reset_callback, - .session_connected_callback = builtin_session_connected_callback, - .session_accept_callback = builtin_session_create_callback, - .session_disconnect_callback = builtin_session_disconnect_callback, - .builtin_server_rx_callback = builtin_server_rx_callback - }; +static session_cb_vft_t builtin_clients = { + .session_reset_callback = builtin_session_reset_callback, + .session_connected_callback = builtin_session_connected_callback, + .session_accept_callback = builtin_session_create_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; /* *INDENT-ON* */ static int -attach_builtin_test_clients () +attach_builtin_test_clients_app 
(void) { tclient_main_t *tm = &tclient_main; vnet_app_attach_args_t _a, *a = &_a; u8 segment_name[128]; - u32 segment_name_length; + u32 segment_name_length, prealloc_fifos; u64 options[16]; segment_name_length = ARRAY_LEN (segment_name); @@ -518,13 +479,68 @@ attach_builtin_test_clients () a->segment_name_length = segment_name_length; a->session_cb_vft = &builtin_clients; + prealloc_fifos = tm->prealloc_fifos ? tm->expected_connections : 1; + options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; - options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); + options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; + options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; + options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; a->options = options; - return vnet_application_attach (a); + if (vnet_application_attach (a)) + return -1; + + tm->app_index = a->app_index; + return 0; +} + +static void * +tclient_thread_fn (void *arg) +{ + return 0; +} + +/** Start a transmit thread */ +int +start_tx_pthread (tclient_main_t * tm) +{ + if (tm->client_thread_handle == 0) + { + int rv = pthread_create (&tm->client_thread_handle, + NULL /*attr */ , + tclient_thread_fn, 0); + if (rv) + { + tm->client_thread_handle = 0; + return -1; + } + } + return 0; +} + +void +clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) +{ + tclient_main_t *tm = &tclient_main; + vnet_connect_args_t _a, *a = &_a; + int i; + for (i = 0; i < n_clients; i++) + { + memset (a, 0, sizeof (*a)); + + a->uri = (char *) uri; + a->api_context = i; + a->app_index = tm->app_index; + a->mp = 0; + vnet_connect_uri (a); + + /* Crude pacing for call setups, 100k/sec */ + vlib_process_suspend (vm, 10e-6); + } } static clib_error_t * @@ -534,17 +550,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, { tclient_main_t *tm = &tclient_main; vlib_thread_main_t *thread_main = 
vlib_get_thread_main (); - uword *event_data = 0; - uword event_type; - u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234"; - u8 *uri; + uword *event_data = 0, event_type; + u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri; + u64 tmp, total_bytes; + f64 cli_timeout = 20.0, delta; u32 n_clients = 1; + char *transfer_type; int i; - u64 tmp; - f64 cli_timeout = 20.0; - f64 delta; tm->bytes_to_send = 8192; + tm->no_return = 0; + tm->fifo_size = 64 << 10; + vec_free (tm->connect_uri); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) @@ -561,11 +578,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "cli-timeout %f", &cli_timeout)) ; + else if (unformat (input, "no-return")) + tm->no_return = 1; + else if (unformat (input, "fifo-size %d", &tm->fifo_size)) + tm->fifo_size <<= 10; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } + /* Store cli process node index for signalling */ + tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index; + if (tm->is_init == 0) { if (tcp_test_clients_init (vm)) @@ -575,28 +599,25 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->ready_connections = 0; tm->expected_connections = n_clients; tm->rx_total = 0; + tm->tx_total = 0; - uri = connect_uri; + uri = default_connect_uri; if (tm->connect_uri) uri = tm->connect_uri; #if TCP_BUILTIN_CLIENT_PTHREAD - /* Start a transmit thread */ - if (tm->client_thread_handle == 0) + start_tx_pthread (); +#endif + + vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); + + if (tm->test_client_attached == 0) { - int rv = pthread_create (&tm->client_thread_handle, - NULL /*attr */ , - tclient_thread_fn, 0); - if (rv) + if (attach_builtin_test_clients_app ()) { - tm->client_thread_handle = 0; - return clib_error_return (0, "pthread_create returned %d", rv); + return clib_error_return (0, "app attach failed"); } } -#endif - vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. 
*/ ); - if (tm->test_client_attached == 0) - attach_builtin_test_clients (); tm->test_client_attached = 1; /* Turn on the builtin client input nodes */ @@ -604,25 +625,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_node_set_state (vlib_mains[i], builtin_client_node.index, VLIB_NODE_STATE_POLLING); - tm->cli_node_index = vlib_get_current_process (vm)->node_runtime.node_index; - /* Fire off connect requests */ - for (i = 0; i < n_clients; i++) - { - vl_api_connect_uri_t _cmp, *cmp = &_cmp; - void vl_api_connect_uri_t_handler (vl_api_connect_uri_t * cmp); - - memset (cmp, 0, sizeof (*cmp)); - - cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); - cmp->client_index = tm->my_client_index; - cmp->context = ntohl (0xfeedface); - memcpy (cmp->uri, uri, strlen ((char *) uri) + 1); - - vl_api_connect_uri_t_handler (cmp); - /* Crude pacing for call setups, 100k/sec */ - vlib_process_suspend (vm, 10e-6); - } + clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... */ vlib_process_wait_for_event_or_clock (vm, 10.0 /* timeout, seconds */ ); @@ -668,14 +672,17 @@ test_tcp_clients_command_fn (vlib_main_t * vm, if (delta != 0.0) { + total_bytes = (tm->no_return ? tm->tx_total : tm->rx_total); + transfer_type = tm->no_return ? 
"half-duplex" : "full-duplex"; vlib_cli_output (vm, "%lld bytes (%lld mbytes, %lld gbytes) in %.2f seconds", - tm->rx_total, tm->rx_total / (1ULL << 20), - tm->rx_total / (1ULL << 30), delta); - vlib_cli_output (vm, "%.2f bytes/second full-duplex", - ((f64) tm->rx_total) / (delta)); - vlib_cli_output (vm, "%.4f gbit/second full-duplex", - (((f64) tm->rx_total * 8.0) / delta / 1e9)); + total_bytes, total_bytes / (1ULL << 20), + total_bytes / (1ULL << 30), delta); + vlib_cli_output (vm, "%.2f bytes/second %s", + ((f64) total_bytes) / (delta), transfer_type); + vlib_cli_output (vm, "%.4f gbit/second %s", + (((f64) total_bytes * 8.0) / delta / 1e9), + transfer_type); } else vlib_cli_output (vm, "zero delta-t?"); diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index d5d79e53..3462e0ee 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -44,78 +44,59 @@ typedef struct typedef struct { - /* API message ID base */ - u16 msg_id_base; - - /* vpe input queue */ - unix_shared_memory_queue_t *vl_input_queue; - - /* API client handle */ - u32 my_client_index; - - /* The URI we're playing with */ - u8 *uri; - - /* Session pool */ - session_t *sessions; - - /* Hash table for disconnect processing */ - uword *session_index_by_vpp_handles; - - /* intermediate rx buffer */ - u8 *rx_buf; - - /* URI for slave's connect */ - u8 *connect_uri; - - u32 connected_session_index; - - int i_am_master; - - /* drop all packets */ - int drop_packets; - - /* Our event queue */ - unix_shared_memory_queue_t *our_event_queue; - - /* $$$ single thread only for the moment */ - unix_shared_memory_queue_t *vpp_event_queue; - - pid_t my_pid; - - f64 test_start_time; - f64 test_end_time; - - u32 expected_connections; + /* + * Application setup parameters + */ + unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */ + unix_shared_memory_queue_t *our_event_queue; /**< Our event queue */ + unix_shared_memory_queue_t *vpp_event_queue; /**< 
$$$ single thread */ + + u32 cli_node_index; /**< cli process node index */ + u32 my_client_index; /**< loopback API client handle */ + u32 app_index; /**< app index after attach */ + + /* + * Configuration params + */ + u8 *connect_uri; /**< URI for slave's connect */ + u64 bytes_to_send; /**< Bytes to send */ + u32 configured_segment_size; + u32 fifo_size; + u32 expected_connections; /**< Number of clients/connections */ + + /* + * Test state variables + */ + session_t *sessions; /**< Sessions pool */ + u8 *rx_buf; /**< intermediate rx buffer */ + uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ + u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + pthread_t client_thread_handle; + volatile u32 ready_connections; volatile u32 finished_connections; - volatile u64 rx_total; - u32 cli_node_index; - - /* Signal variable */ - volatile int run_test; - - /* Bytes to send */ - u64 bytes_to_send; - - u32 configured_segment_size; + volatile u64 tx_total; + volatile int run_test; /**< Signal start of test */ - /* VNET_API_ERROR_FOO -> "Foo" hash table */ - uword *error_string_by_error_number; - - u8 *connect_test_data; - pthread_t client_thread_handle; - u64 client_bytes_received; - u8 test_return_packets; + f64 test_start_time; + f64 test_end_time; + /* + * Flags + */ u8 is_init; u8 test_client_attached; + u8 no_return; + u8 test_return_packets; + int i_am_master; + int drop_packets; /**< drop all packets */ + u8 prealloc_fifos; /**< Request fifo preallocation */ - u32 node_index; - - /* convenience */ + /* + * Convenience + */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; ethernet_main_t *ethernet_main; diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 8bd2f360..775bfc26 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -39,21 +39,30 @@ typedef struct { - /* Per-thread RX buffer */ - u8 **rx_buf; + /* + * Server app parameters + */ 
unix_shared_memory_queue_t **vpp_queue; - u64 byte_index; + unix_shared_memory_queue_t *vl_input_queue; /**< Sever's event queue */ - /* Sever's event queue */ - unix_shared_memory_queue_t *vl_input_queue; + u32 app_index; /**< Server app index */ + u32 my_client_index; /**< API client handle */ + u32 node_index; /**< process node index for evnt scheduling */ - /* API client handle */ - u32 my_client_index; + /* + * Config params + */ + u8 no_echo; /**< Don't echo traffic */ + u32 fifo_size; /**< Fifo size */ + u32 rcv_buffer_size; /**< Rcv buffer size */ + u32 prealloc_fifos; /**< Preallocate fifos */ - u32 app_index; + /* + * Test state + */ + u8 **rx_buf; /**< Per-thread RX buffer */ + u64 byte_index; - /* process node index for evnt scheduling */ - u32 node_index; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -132,6 +141,29 @@ test_bytes (builtin_server_main_t * bsm, int actual_transfer) bsm->byte_index += actual_transfer; } +/* + * If no-echo, just read the data and be done with it + */ +int +builtin_server_rx_callback_no_echo (stream_session_t * s) +{ + builtin_server_main_t *bsm = &builtin_server_main; + u32 my_thread_id = vlib_get_thread_index (); + int actual_transfer; + svm_fifo_t *rx_fifo; + + rx_fifo = s->server_rx_fifo; + + do + { + actual_transfer = + svm_fifo_dequeue_nowait (rx_fifo, bsm->rcv_buffer_size, + bsm->rx_buf[my_thread_id]); + } + while (actual_transfer > 0); + return 0; +} + int builtin_server_rx_callback (stream_session_t * s) { @@ -143,8 +175,8 @@ builtin_server_rx_callback (stream_session_t * s) static int serial_number = 0; u32 my_thread_id = vlib_get_thread_index (); - tx_fifo = s->server_tx_fifo; rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -164,19 +196,22 @@ builtin_server_rx_callback (stream_session_t * s) /* Program self-tap to retry */ if (svm_fifo_set_event (rx_fifo)) { + 
unix_shared_memory_queue_t *q; evt.fifo = rx_fifo; evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, - 0 /* do wait for mutex */ ); + + q = bsm->vpp_queue[s->thread_index]; + if (PREDICT_FALSE (q->cursize == q->maxsize)) + clib_warning ("out of event queue space"); + else + unix_shared_memory_queue_add (q, (u8 *) & evt, + 0 /* don't wait for mutex */ ); } return 0; } - vec_validate (bsm->rx_buf, my_thread_id); - vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1); _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, @@ -281,14 +316,21 @@ server_attach () memset (a, 0, sizeof (*a)); memset (options, 0, sizeof (options)); + if (bsm->no_echo) + builtin_session_cb_vft.builtin_server_rx_callback = + builtin_server_rx_callback_no_echo; + else + builtin_session_cb_vft.builtin_server_rx_callback = + builtin_server_rx_callback; a->api_client_index = bsm->my_client_index; a->session_cb_vft = &builtin_session_cb_vft; a->options = options; a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; - a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; - a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; - a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 8192; + a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = + bsm->prealloc_fifos ? 
bsm->prealloc_fifos : 1; a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -316,17 +358,24 @@ static int server_create (vlib_main_t * vm) { builtin_server_main_t *bsm = &builtin_server_main; - u32 num_threads; vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; + int i; if (bsm->my_client_index == (u32) ~ 0) { if (create_api_loopback (vm)) - return -1; + { + clib_warning ("failed to create api loopback"); + return -1; + } } num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); + vec_validate (bsm->rx_buf, num_threads - 1); + for (i = 0; i < num_threads; i++) + vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); if (server_attach ()) { @@ -381,23 +430,35 @@ tcp_builtin_server_api_hookup (vlib_main_t * vm) } static clib_error_t * -server_create_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) +server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) { + builtin_server_main_t *bsm = &builtin_server_main; int rv; -#if 0 + + bsm->no_echo = 0; + bsm->fifo_size = 64 << 10; + bsm->rcv_buffer_size = 128 << 10; + bsm->prealloc_fifos = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - if (unformat (input, "whatever %d", &whatever)) + if (unformat (input, "no-echo")) + bsm->no_echo = 1; + else if (unformat (input, "fifo-size %d", &bsm->fifo_size)) + bsm->fifo_size <<= 10; + else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) + ; + else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } -#endif tcp_builtin_server_api_hookup (vm); vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. 
*/ ); + rv = server_create (vm); switch (rv) { @@ -406,6 +467,7 @@ server_create_command_fn (vlib_main_t * vm, default: return clib_error_return (0, "server_create returned %d", rv); } + return 0; } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index e0b67a8e..5c554bac 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -726,15 +726,25 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) u32 tcp_snd_space (tcp_connection_t * tc) { - int snd_space; + int snd_space, snt_limited; - /* If we haven't gotten dupacks or if we did and have gotten sacked bytes - * then we can still send */ - if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0 - && (tc->rcv_dupacks == 0 - || tc->sack_sb.last_sacked_bytes))) + if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) { snd_space = tcp_available_snd_space (tc); + + /* If we haven't gotten dupacks or if we did and have gotten sacked + * bytes then we can still send as per Limited Transmit (RFC3042) */ + if (PREDICT_FALSE (tc->rcv_dupacks != 0 + && (tcp_opts_sack_permitted (tc) + && tc->sack_sb.last_sacked_bytes == 0))) + { + if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt) + tc->limited_transmit = tc->snd_nxt; + ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt)); + + snt_limited = tc->snd_nxt - tc->limited_transmit; + snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0); + } return tcp_round_snd_space (tc, snd_space); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 071f1ab1..e8398718 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -31,9 +31,9 @@ #define TCP_MAX_OPTION_SPACE 40 #define TCP_DUPACK_THRESHOLD 3 -#define TCP_MAX_RX_FIFO_SIZE 2 << 20 +#define TCP_MAX_RX_FIFO_SIZE 4 << 20 #define TCP_IW_N_SEGMENTS 10 -#define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ +#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ #define TCP_USE_SACKS 1 /**< Disable only for testing */ /** TCP FSM state definitions as per RFC793. 
*/ @@ -100,6 +100,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ +#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */ #define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ #define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ #define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ @@ -149,7 +150,7 @@ enum #undef _ }; -#define TCP_MAX_SACK_BLOCKS 5 /**< Max number of SACK blocks stored */ +#define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */ #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) typedef struct _sack_scoreboard_hole @@ -208,6 +209,7 @@ typedef struct _tcp_connection u32 snd_wl1; /**< seq number used for last snd.wnd update */ u32 snd_wl2; /**< ack number used for last snd.wnd update */ u32 snd_nxt; /**< next seq number to be sent */ + u16 snd_mss; /**< Effective send max seg (data) size */ /** Receive sequence variables RFC793 */ u32 rcv_nxt; /**< next sequence number expected */ @@ -252,8 +254,8 @@ typedef struct _tcp_connection u32 rtt_ts; /**< Timestamp for tracked ACK */ u32 rtt_seq; /**< Sequence number for tracked ACK */ - u16 snd_mss; /**< Effective send max seg (data) size */ u16 mss; /**< Our max seg size that includes options */ + u32 limited_transmit; /**< snd_nxt when limited transmit starts */ } tcp_connection_t; struct _tcp_cc_algorithm @@ -433,6 +435,7 @@ tcp_end_seq (tcp_header_t * th, u32 len) #define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) #define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) #define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) +#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? 
(_s1) : (_s2)) /* Modulo arithmetic for timestamps */ #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) @@ -719,6 +722,7 @@ scoreboard_clear (sack_scoreboard_t * sb) { scoreboard_remove_hole (sb, hole); } + ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; sb->last_bytes_delivered = 0; diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 3a16cf63..ae68ad1b 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,8 +19,10 @@ #include #define TCP_DEBUG (1) +#define TCP_DEBUG_SM (0) #define TCP_DEBUG_CC (1) -#define TCP_DEBUG_VERBOSE (0) +#define TCP_DEBUG_CC_STAT (1) +#define TCP_DEBUG_SM_VERBOSE (0) #define foreach_tcp_dbg_evt \ _(INIT, "") \ @@ -49,6 +51,8 @@ _(CC_RTX, "retransmit") \ _(CC_EVT, "cc event") \ _(CC_PACK, "cc partial ack") \ + _(CC_STAT, "cc stats") \ + _(CC_RTO_STAT, "cc rto stats") \ _(SEG_INVALID, "invalid segment") \ _(PAWS_FAIL, "failed paws check") \ _(ACK_RCV_ERR, "invalid ack") \ @@ -72,6 +76,10 @@ typedef enum _tcp_dbg_evt #define TRANSPORT_DEBUG (1) +/* + * Infra and evt track setup + */ + #define TCP_DBG(_tc, _evt, _args...) \ { \ u8 *_tmp = 0; \ @@ -158,6 +166,30 @@ typedef enum _tcp_dbg_evt TCP_EVT_DEALLOC_HANDLER(_tc); \ } +#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYNrx: irs %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ +} + +#define CONCAT_HELPER(_a, _b) _a##_b +#define CC(_a, _b) CONCAT_HELPER(_a, _b) +#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) +#else +#define TCP_EVT_DBG(_evt, _args...) +#endif + +/* + * State machine + */ +#if TCP_DEBUG_SM + #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -234,18 +266,6 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) 
\ -{ \ - TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "SYNrx: irs %u", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->irs; \ -} - #define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -418,6 +438,74 @@ typedef enum _tcp_dbg_evt ed->data[4] = _tc->snd_una_max - _tc->iss; \ } +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} +#else +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) +#define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) +#define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) +#endif + +/* + * State machine verbose + */ +#if TCP_DBG_SM_VERBOSE +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "snd_wnd update: %u ", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->snd_wnd; \ +} + +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "out: flags %x, bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = flags; \ + ed->data[1] = n_bytes; \ +} +#else +#define TCP_EVT_SND_WND_HANDLER(_tc, ...) +#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#endif + /* * Congestion Control */ @@ -471,67 +559,59 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->snd_una_max - _tc->iss; \ } -#else -#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) -#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, _snd_space, ...) -#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) -#endif +/* + * Congestion control stats + */ +#if TCP_DEBUG_CC_STAT -#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +#define STATS_INTERVAL 1 + +#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \ { \ -if (_av > 0) \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ - .format_args = "i4i4i4i4i4", \ + .format = "rto_stat: rto %u srtt %u rttvar %u ", \ + .format_args = "i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_wnd; \ - ed->data[1] = _obs; \ - ed->data[2] = _av; \ - ed->data[3] = _tc->rcv_nxt - _tc->irs; \ - ed->data[4] = _tc->rcv_las - _tc->irs; \ + DECLARE_ETD(_tc, _e, 3); \ + ed->data[0] = _tc->rto; \ + ed->data[1] = _tc->srtt; \ + ed->data[2] = _tc->rttvar; \ } \ } -#if TCP_DBG_VERBOSE -#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) 
\ { \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "snd_wnd update: %u ", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->snd_wnd; \ -} - -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) \ +if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "out: flags %x, bytes %u", \ - .format_args = "i4i4", \ + .format = "cc_stat: cwnd %u flight %u space %u ssthresh %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = flags; \ - ed->data[1] = n_bytes; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->cwnd; \ + ed->data[1] = tcp_flight_size (_tc); \ + ed->data[2] = tcp_snd_space (_tc); \ + ed->data[3] = _tc->ssthresh; \ + ed->data[4] = _tc->snd_wnd; \ + TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \ + _tc->c_cc_stat_tstamp = tcp_time_now(); \ +} \ } + #else -#define TCP_EVT_SND_WND_HANDLER(_tc, ...) -#define TCP_EVT_OUTPUT_HANDLER(_tc, flags, n_bytes,...) +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) #endif -#define CONCAT_HELPER(_a, _b) _a##_b -#define CC(_a, _b) CONCAT_HELPER(_a, _b) -#define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) - #else -#define TCP_EVT_DBG(_evt, _args...) +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) #endif - #endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index ff2229b3..a2e6dad1 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -378,16 +378,20 @@ tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0) static void tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) { - int err; + int err, diff; if (tc->srtt != 0) { err = mrtt - tc->srtt; - tc->srtt += err >> 3; +// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. 
* The increase should be bound */ - tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; +// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; + + tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); + diff = (clib_abs (err) - (int) tc->rttvar) >> 2; + tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); } else { @@ -401,6 +405,7 @@ void tcp_update_rto (tcp_connection_t * tc) { tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tc->rto = clib_max (tc->rto, TCP_RTO_MIN); } /** Update RTT estimate and RTO timer @@ -417,8 +422,8 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) u32 mrtt = 0; u8 rtx_acked; - /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */ - rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); + /* Determine if only rtx bytes are acked. */ + rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked; /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ @@ -428,8 +433,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: - * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't - * try to update rtt for dupacks */ + * seq_lt (tc->snd_una, ack). 
*/ else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr && tc->bytes_acked) { @@ -550,11 +554,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, prev = scoreboard_get_hole (sb, prev_index); if (prev) { - hole->prev = prev - sb->holes; + hole->prev = prev_index; hole->next = prev->next; if ((next = scoreboard_next_hole (sb, hole))) next->prev = hole_index; + else + sb->tail = hole_index; prev->next = hole_index; } @@ -569,12 +575,13 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, } void -scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) +scoreboard_update_bytes (tcp_connection_t * tc, sack_scoreboard_t * sb) { sack_scoreboard_hole_t *hole, *prev; u32 bytes = 0, blks = 0; sb->lost_bytes = 0; + sb->sacked_bytes = 0; hole = scoreboard_last_hole (sb); if (!hole) return; @@ -594,13 +601,16 @@ scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) hole = prev; } - hole = prev; while (hole) { sb->lost_bytes += scoreboard_hole_bytes (hole); hole->is_lost = 1; + prev = hole; hole = scoreboard_prev_hole (sb, hole); + if (hole) + bytes += prev->start - hole->end; } + sb->sacked_bytes = bytes; } /** @@ -677,7 +687,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; - sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; + sack_scoreboard_hole_t *hole, *next_hole, *last_hole; u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; @@ -743,6 +753,10 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) if (seq_gt (tc->snd_una_max, sb->high_sacked) && seq_gt (tc->snd_una_max, last_hole->end)) last_hole->end = tc->snd_una_max; + /* keep track of max byte sacked for when the last hole + * is acked */ + if (seq_gt (tmp.end, sb->high_sacked)) + sb->high_sacked = tmp.end; } /* Walk the holes with the SACK blocks */ @@ -758,45 +772,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { next_hole = scoreboard_next_hole (sb, hole); 
- /* Byte accounting */ - if (seq_leq (hole->end, ack)) - { - /* Bytes lost because snd_wnd left edge advances */ - if (next_hole && seq_leq (next_hole->start, ack)) - sb->last_bytes_delivered += next_hole->start - hole->end; - else - sb->last_bytes_delivered += ack - hole->end; - } - else - { - sb->sacked_bytes += scoreboard_hole_bytes (hole); - } - - /* About to remove last hole */ - if (hole == last_hole) - { - sb->tail = hole->prev; - last_hole = scoreboard_last_hole (sb); - /* keep track of max byte sacked for when the last hole - * is acked */ - if (seq_gt (hole->end, sb->high_sacked)) - sb->high_sacked = hole->end; - } - - /* snd_una needs to be advanced */ - if (blk->end == ack && seq_geq (ack, hole->end)) + /* Byte accounting: snd_una needs to be advanced */ + if (blk->end == ack) { - if (next_hole && seq_lt (ack, next_hole->start)) + if (next_hole) { - sb->snd_una_adv = next_hole->start - ack; - - /* all these can be delivered */ - sb->last_bytes_delivered += sb->snd_una_adv; + if (seq_lt (ack, next_hole->start)) + sb->snd_una_adv = next_hole->start - ack; + sb->last_bytes_delivered += + next_hole->start - hole->end; } else if (!next_hole) { sb->snd_una_adv = sb->high_sacked - ack; - sb->last_bytes_delivered += sb->snd_una_adv; + sb->last_bytes_delivered += sb->high_sacked - hole->end; } } @@ -808,7 +797,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { if (seq_gt (blk->end, hole->start)) { - sb->sacked_bytes += blk->end - hole->start; hole->start = blk->end; } blk_index++; @@ -819,28 +807,16 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) /* Hole must be split */ if (seq_lt (blk->end, hole->end)) { - sb->sacked_bytes += blk->end - blk->start; hole_index = scoreboard_hole_index (sb, hole); - new_hole = scoreboard_insert_hole (sb, hole_index, blk->end, - hole->end); + scoreboard_insert_hole (sb, hole_index, blk->end, hole->end); /* Pool might've moved */ hole = scoreboard_get_hole (sb, hole_index); hole->end = blk->start; - - /* New or split of tail 
*/ - if ((last_hole->end == new_hole->end) - || seq_lt (last_hole->end, new_hole->start)) - { - last_hole = new_hole; - sb->tail = scoreboard_hole_index (sb, new_hole); - } - blk_index++; } - else if (seq_leq (blk->start, hole->end)) + else if (seq_lt (blk->start, hole->end)) { - sb->sacked_bytes += hole->end - blk->start; hole->end = blk->start; } @@ -848,9 +824,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } } - sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; - sb->sacked_bytes -= sb->last_bytes_delivered; - scoreboard_update_lost (tc, sb); + scoreboard_update_bytes (tc, sb); + sb->last_sacked_bytes = sb->sacked_bytes + - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->sacked_bytes == 0 + || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max + - seq_max (tc->snd_una, ack)); } /** @@ -998,9 +978,14 @@ tcp_should_fastrecover (tcp_connection_t * tc) || tcp_should_fastrecover_sack (tc)); } +/** + * One function to rule them all ... and in the darkness bind them + */ static void tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { + u32 rxt_delivered; + /* * Duplicate ACK. Check if we should enter fast recovery, or if already in * it account for the bytes that left the network. 
@@ -1028,10 +1013,15 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) } /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold + * 1) Cumulative ack does not cover more than congestion threshold, + * and the following doesn't hold: the congestion window is + * greater than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes (XXX) * 2) RFC6582 heuristic to avoid multiple fast retransmits */ - if (seq_leq (tc->snd_una, tc->snd_congestion) + if ((seq_gt (tc->snd_una, tc->snd_congestion) + || !(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) || tc->rcv_opts.tsecr != tc->tsecr_last_ack) { tc->rcv_dupacks = 0; @@ -1089,7 +1079,10 @@ partial_ack: { /* If spurious return, we've already updated everything */ if (tcp_cc_recover (tc)) - return; + { + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + return; + } tc->snd_nxt = tc->snd_una_max; @@ -1115,12 +1108,16 @@ partial_ack: return; /* Remove retransmitted bytes that have been delivered */ - if (tc->sack_sb.last_bytes_delivered - && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv + >= tc->sack_sb.last_bytes_delivered); + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; + if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ - tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered; + ASSERT (tc->snd_rxt_bytes >= rxt_delivered); + tc->snd_rxt_bytes -= rxt_delivered; } else { @@ -1154,6 +1151,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, prev_snd_una; u8 is_dack; + TCP_EVT_DBG (TCP_EVT_CC_STAT, tc); + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { @@ -1282,6 
+1281,10 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) { vec_add1 (new_list, tc->snd_sacks[i]); } + else + { + clib_warning ("sack discarded"); + } } ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS); @@ -1358,16 +1361,18 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, stream_session_t *s0; int rv; + ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Do nothing */ if (PREDICT_FALSE (data_len == 0)) { return TCP_ERROR_PURE_ACK; } - /* Enqueue out-of-order data with absolute offset */ + /* Enqueue out-of-order data with relative offset */ rv = stream_session_enqueue_data (&tc->connection, b, - vnet_buffer (b)->tcp.seq_number, - 0 /* queue event */ , 0); + vnet_buffer (b)->tcp.seq_number - + tc->rcv_nxt, 0 /* queue event */ , 0); /* Nothing written */ if (rv) @@ -1388,10 +1393,15 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, /* Get the newest segment from the fifo */ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); - start = ooo_segment_offset (s0->server_rx_fifo, newest); - end = ooo_segment_end_offset (s0->server_rx_fifo, newest); + if (newest) + { + start = + tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + end = start + ooo_segment_length (s0->server_rx_fifo, newest); + tcp_update_sack_list (tc, start, end); - tcp_update_sack_list (tc, start, end); + ASSERT (seq_gt (start, tc->rcv_nxt)); + } } return TCP_ERROR_ENQUEUED; @@ -1411,7 +1421,7 @@ tcp_can_delack (tcp_connection_t * tc) /* constrained to send ack */ || (tc->flags & TCP_CONN_SNDACK) != 0 /* we're almost out of tx wnd */ - || tcp_available_snd_space (tc) < 2 * tc->snd_mss) + || tcp_available_snd_space (tc) < 4 * tc->snd_mss) return 0; return 1; @@ -1434,7 +1444,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, *next0 = TCP_NEXT_DROP; /* Completely in the past (possible retransmit) */ - if (seq_lt (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) + if (seq_leq 
(vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) goto done; /* Chop off the bytes in the past */ @@ -1873,8 +1883,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->rcv_opts)) new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; - /* No scaling */ - new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) + << new_tc0->snd_wscale; new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; @@ -1892,8 +1902,15 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Make sure las is initialized for the wnd computation */ new_tc0->rcv_las = new_tc0->rcv_nxt; - /* Notify app that we have connection */ - stream_session_connect_notify (&new_tc0->connection, sst, 0); + /* Notify app that we have connection. If session layer can't + * allocate session send reset */ + if (stream_session_connect_notify (&new_tc0->connection, sst, + 0)) + { + tcp_connection_cleanup (new_tc0); + tcp_send_reset (b0, is_ip4); + goto drop; + } stream_session_init_fifos_pointers (&new_tc0->connection, new_tc0->irs + 1, @@ -1907,7 +1924,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->state = TCP_STATE_SYN_RCVD; /* Notify app that we have connection */ - stream_session_connect_notify (&new_tc0->connection, sst, 0); + if (stream_session_connect_notify + (&new_tc0->connection, sst, 0)) + { + tcp_connection_cleanup (new_tc0); + tcp_send_reset (b0, is_ip4); + goto drop; + } + stream_session_init_fifos_pointers (&new_tc0->connection, new_tc0->irs + 1, new_tc0->iss + 1); @@ -2508,8 +2532,8 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&child0->rcv_opts)) child0->snd_wscale = child0->rcv_opts.wscale; - /* No scaling */ - child0->snd_wnd = clib_net_to_host_u16 (th0->window); + child0->snd_wnd = clib_net_to_host_u16 (th0->window) + << child0->snd_wscale; child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; child0->snd_wl2 
= vnet_buffer (b0)->tcp.ack_number; @@ -2892,6 +2916,9 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(CLOSE_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c66250e4..c825e952 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -18,7 +18,6 @@ void newreno_congestion (tcp_connection_t * tc) { - tc->prev_ssthresh = tc->ssthresh; tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); } @@ -47,7 +46,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) { if (ack_type == TCP_CC_DUPACK) { - tc->cwnd += tc->snd_mss; + if (!tcp_opts_sack_permitted (tc)) + tc->cwnd += tc->snd_mss; } else if (ack_type == TCP_CC_PARTIALACK) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 47c94e6d..554a981d 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1052,6 +1052,7 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } @@ -1213,7 +1214,7 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ - if (tc->state == TCP_STATE_CLOSED + if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) return; @@ -1505,10 +1506,7 @@ tcp46_output_inline (vlib_main_t * vm, /* Stop DELACK 
timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); - if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) - { - tcp_timer_reset (tc0, TCP_TIMER_DELACK); - } + tcp_timer_reset (tc0, TCP_TIMER_DELACK); /* If not retransmitting * 1) update snd_una_max (SYN, SYNACK, FIN) @@ -1630,7 +1628,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); - if (tc->rtt_ts == 0) + if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) { tc->rtt_ts = tcp_time_now (); tc->rtt_seq = tc->snd_nxt; diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 3f8afa40..a461e3b8 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -190,11 +190,18 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((pool_elts (sb->holes) == 1), "scoreboard has %d elements", pool_elts (sb->holes)); + hole = scoreboard_first_hole (sb); + TCP_TEST ((hole->prev == TCP_INVALID_SACK_HOLE_INDEX + && hole->next == TCP_INVALID_SACK_HOLE_INDEX), "hole is valid"); + TCP_TEST ((sb->last_bytes_delivered == 100), "last bytes delivered %d", + sb->last_bytes_delivered); /* * Add some more blocks and then remove all */ vec_reset_length (tc->rcv_opts.sacks); + tc->snd_una += sb->snd_una_adv; + tc->snd_una_max = 1900; for (i = 0; i < 5; i++) { block.start = i * 100 + 1200; @@ -242,6 +249,39 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); + /* + * Inject one block, ack it and overlap hole + */ + + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + + block.start = 100; + block.end = 500; + vec_add1 (tc->rcv_opts.sacks, block); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + + tcp_rcv_sacks (tc, 0); + + if (verbose) + vlib_cli_output (vm, "sb added [100, 500]:\n%U", + 
format_tcp_scoreboard, sb); + + tcp_rcv_sacks (tc, 800); + + if (verbose) + vlib_cli_output (vm, "sb ack [0, 800]:\n%U", format_tcp_scoreboard, sb); + + TCP_TEST ((pool_elts (sb->holes) == 1), + "scoreboard has %d elements", pool_elts (sb->holes)); + TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); + TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); + TCP_TEST ((sb->last_sacked_bytes == 0), + "last sacked bytes %d", sb->last_sacked_bytes); + TCP_TEST ((sb->last_bytes_delivered == 400), + "last bytes delivered %d", sb->last_bytes_delivered); + return 0; } @@ -571,7 +611,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) */ for (i = 0; i < 3; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); if (i == 0) { @@ -600,7 +640,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) /* * Try adding a completely overlapped segment */ - offset = 3 * sizeof (u32); + offset = 3 * sizeof (u32) - f->tail; data = (u8 *) (test_data + 3); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (rv) @@ -626,7 +666,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) */ for (i = 3; i > 1; i--) { - offset = (2 * i + 0) * sizeof (u32); + offset = (2 * i + 0) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 0)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -688,7 +728,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 4; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -701,7 +741,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } - rv = svm_fifo_enqueue_with_offset (f, 8, 21, data); + rv = svm_fifo_enqueue_with_offset (f, 8 - f->tail, 21, data); 
TCP_TEST ((rv == 0), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); @@ -722,7 +762,7 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 4; i++) { - offset = (2 * i + 1) * sizeof (u32); + offset = (2 * i + 1) * sizeof (u32) - f->tail; data = (u8 *) (test_data + (2 * i + 1)); rv = svm_fifo_enqueue_with_offset (f, offset, sizeof (u32), data); if (verbose) @@ -735,7 +775,13 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) } } + if (verbose) + vlib_cli_output (vm, "fifo after enqueue: %U", format_svm_fifo, f, 1); + rv = svm_fifo_enqueue_nowait (f, 29, data); + if (verbose) + vlib_cli_output (vm, "fifo after enqueueing 29: %U", format_svm_fifo, f, + 1); TCP_TEST ((rv == 32), "ooo enqueued %u", rv); TCP_TEST ((svm_fifo_number_ooo_segments (f) == 0), "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); @@ -788,7 +834,8 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = vp + i; data64 = tp->offset; - svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, (u8 *) & data64); + svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, + (u8 *) & data64); } /* Expected result: one big fat chunk at offset 4 */ @@ -817,7 +864,7 @@ tcp_test_fifo2 (vlib_main_t * vm) { tp = &test_data[i]; data64 = tp->offset; - rv = svm_fifo_enqueue_with_offset (f, tp->offset, tp->len, + rv = svm_fifo_enqueue_with_offset (f, tp->offset - f->tail, tp->len, (u8 *) & data64); if (rv) { @@ -991,8 +1038,9 @@ tcp_test_fifo3 (vlib_main_t * vm, unformat_input_t * input) for (i = !randomize; i < vec_len (generate); i++) { tp = generate + i; - svm_fifo_enqueue_with_offset (f, fifo_initial_offset + tp->offset, - tp->len, + svm_fifo_enqueue_with_offset (f, + fifo_initial_offset + tp->offset - + f->tail, tp->len, (u8 *) data_pattern + tp->offset); } @@ -1107,7 +1155,7 @@ tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input) for (i = test_n_bytes - 1; i > 0; i--) { - 
rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i, + rv = svm_fifo_enqueue_with_offset (f, fifo_initial_offset + i - f->tail, sizeof (u8), &test_data[i]); if (verbose) vlib_cli_output (vm, "add [%d] [%d, %d]", i, i, i + sizeof (u8)); -- cgit 1.2.3-korg From 2c25a62cc1cc4937165de740a3b32d78429c72d6 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Mon, 26 Jun 2017 11:35:07 -0400 Subject: Horizontal (nSessions) scaling draft - Data structure preallocation. - Input state machine fixes for mid-stream 3-way handshake retries. - Batch connections in the builtin_client - Multiple private fifo segment support - Fix elog simultaneous event type registration - Fix sacks when segment hole is added after highest sacked - Add "accepting" session state for sessions pending accept - Add ssvm non-recursive locking - Estimate RTT for syn-ack - Don't init fifo pointers. We're using relative offsets for ooo segments - CLI to dump individual session Change-Id: Ie0598563fd246537bafba4feed7985478ea1d415 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/svm/ssvm.h | 17 +++ src/svm/svm_fifo.c | 56 +++++--- src/svm/svm_fifo.h | 16 ++- src/svm/svm_fifo_segment.c | 114 +++++++++++----- src/svm/svm_fifo_segment.h | 4 +- src/svm/test_svm_fifo1.c | 10 +- src/uri/uri_udp_test.c | 2 +- src/vnet/session/application.c | 2 + src/vnet/session/application_interface.c | 21 --- src/vnet/session/application_interface.h | 12 +- src/vnet/session/node.c | 23 +--- src/vnet/session/segment_manager.c | 26 ++-- src/vnet/session/segment_manager.h | 4 + src/vnet/session/session.c | 72 +++++++--- src/vnet/session/session.h | 30 ++++- src/vnet/session/session_cli.c | 99 +++++++++++--- src/vnet/session/transport.h | 6 + src/vnet/tcp/builtin_client.c | 118 +++++++++++----- src/vnet/tcp/builtin_client.h | 7 +- src/vnet/tcp/builtin_server.c | 66 +++++++-- src/vnet/tcp/tcp.c | 225 ++++++++++++++++++++++++++++--- src/vnet/tcp/tcp.h | 13 ++ src/vnet/tcp/tcp_debug.h | 13 +- src/vnet/tcp/tcp_input.c 
| 97 ++++++++----- src/vnet/tcp/tcp_newreno.c | 4 +- src/vnet/tcp/tcp_output.c | 53 +++++--- src/vnet/tcp/tcp_packet.h | 1 + src/vnet/tcp/tcp_test.c | 10 +- src/vnet/udp/udp_input.c | 2 +- 29 files changed, 838 insertions(+), 285 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/ssvm.h b/src/svm/ssvm.h index bccfc164..8466e155 100644 --- a/src/svm/ssvm.h +++ b/src/svm/ssvm.h @@ -101,6 +101,15 @@ ssvm_lock (ssvm_shared_header_t * h, u32 my_pid, u32 tag) h->tag = tag; } +always_inline void +ssvm_lock_non_recursive (ssvm_shared_header_t * h, u32 tag) +{ + while (__sync_lock_test_and_set (&h->lock, 1)) + ; + + h->tag = tag; +} + always_inline void ssvm_unlock (ssvm_shared_header_t * h) { @@ -113,6 +122,14 @@ ssvm_unlock (ssvm_shared_header_t * h) } } +always_inline void +ssvm_unlock_non_recursive (ssvm_shared_header_t * h) +{ + h->tag = 0; + CLIB_MEMORY_BARRIER (); + h->lock = 0; +} + static inline void * ssvm_push_heap (ssvm_shared_header_t * sh) { diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index aed5d6a7..da60fee5 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -19,29 +19,29 @@ static inline u8 position_lt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - < ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + < ooo_segment_distance_from_tail (f, b)); } static inline u8 position_leq (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - <= ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + <= ooo_segment_distance_from_tail (f, b)); } static inline u8 position_gt (svm_fifo_t * f, u32 a, u32 b) { - return (ooo_segment_distance_to_tail (f, a) - > ooo_segment_distance_to_tail (f, b)); + return (ooo_segment_distance_from_tail (f, a) + > ooo_segment_distance_from_tail (f, b)); } static inline u32 position_diff (svm_fifo_t * f, u32 posa, u32 posb) { - return ooo_segment_distance_to_tail (f, posa) - - 
ooo_segment_distance_to_tail (f, posb); + return ooo_segment_distance_from_tail (f, posa) + - ooo_segment_distance_from_tail (f, posb); } static inline u32 @@ -113,7 +113,7 @@ svm_fifo_create (u32 data_size_in_bytes) if (f == 0) return 0; - memset (f, 0, sizeof (*f) + data_size_in_bytes); + memset (f, 0, sizeof (*f)); f->nitems = data_size_in_bytes; f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; @@ -204,7 +204,19 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { s = prev; s_end_pos = ooo_segment_end_pos (f, s); - goto merge; + + /* Check head and tail now since segment may be wider at both ends so + * merge tests lower won't work */ + if (position_lt (f, normalized_position, s->start)) + { + s->start = normalized_position; + s->length = position_diff (f, s_end_pos, s->start); + } + if (position_gt (f, normalized_end_position, s_end_pos)) + { + s->length = position_diff (f, normalized_end_position, s->start); + } + goto check_tail; } s_index = s - f->ooo_segments; @@ -257,8 +269,6 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) * Merge needed */ -merge: - /* Merge at head */ if (position_lt (f, normalized_position, s->start)) { @@ -278,6 +288,7 @@ merge: goto done; } +check_tail: /* The new segment's tail may cover multiple smaller ones */ if (position_gt (f, normalized_end_position, s_end_pos)) { @@ -296,7 +307,8 @@ merge: /* If partial overlap with last, merge */ if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = ooo_segment_end_pos (f, it) - s->start; + s->length = + position_diff (f, ooo_segment_end_pos (f, it), s->start); ooo_segment_del (f, it - f->ooo_segments); } } @@ -319,9 +331,9 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) i32 diff; s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + diff = ooo_segment_distance_to_tail (f, s->start); - diff = (f->tail >= s->start) ? 
- f->tail - s->start : f->nitems + f->tail - s->start; + ASSERT (diff != n_bytes_enqueued); if (diff > n_bytes_enqueued) return 0; @@ -345,8 +357,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) if (s->next != OOO_SEGMENT_INVALID_INDEX) { s = pool_elt_at_index (f->ooo_segments, s->next); - diff = (f->tail >= s->start) ? - f->tail - s->start : f->nitems + f->tail - s->start; + diff = ooo_segment_distance_to_tail (f, s->start); ooo_segment_del (f, index); } /* End of search */ @@ -357,6 +368,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } + ASSERT (bytes >= 0 && bytes <= f->nitems); return bytes; } @@ -401,6 +413,8 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) } else { + ASSERT (0); + /* Account for a zero-copy enqueue done elsewhere */ ASSERT (max_bytes <= (nitems - cursize)); f->tail += max_bytes; @@ -413,6 +427,7 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); /* Atomically increase the queue length */ + ASSERT (cursize + total_copy_bytes <= nitems); __sync_fetch_and_add (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -475,6 +490,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; + ASSERT (required_bytes < nitems); + normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? 
*/ @@ -557,6 +574,7 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) } else { + ASSERT (0); /* Account for a zero-copy dequeue done elsewhere */ ASSERT (max_bytes <= cursize); f->head += max_bytes; @@ -565,6 +583,8 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_here) total_copy_bytes = max_bytes; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_copy_bytes); __sync_fetch_and_sub (&f->cursize, total_copy_bytes); return (total_copy_bytes); @@ -702,6 +722,8 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) f->head = (f->head == nitems) ? 0 : f->head; } + ASSERT (f->head <= nitems); + ASSERT (cursize >= total_drop_bytes); __sync_fetch_and_sub (&f->cursize, total_drop_bytes); return total_drop_bytes; diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index f32ef41d..fe21de47 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -133,25 +133,31 @@ svm_fifo_newest_ooo_segment (svm_fifo_t * f) } always_inline u32 -ooo_segment_distance_to_tail (svm_fifo_t * f, u32 a) +ooo_segment_distance_from_tail (svm_fifo_t * f, u32 pos) { /* Ambiguous. 
Assumption is that ooo segments don't touch tail */ - if (a == f->tail && f->tail == f->head) + if (PREDICT_FALSE (pos == f->tail && f->tail == f->head)) return f->nitems; - return ((f->nitems + a - f->tail) % f->nitems); + return (((f->nitems + pos) - f->tail) % f->nitems); +} + +always_inline u32 +ooo_segment_distance_to_tail (svm_fifo_t * f, u32 pos) +{ + return (((f->nitems + f->tail) - pos) % f->nitems); } always_inline u32 ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start); + return ooo_segment_distance_from_tail (f, s->start); } always_inline u32 ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) { - return ooo_segment_distance_to_tail (f, s->start) + s->length; + return ooo_segment_distance_from_tail (f, s->start) + s->length; } always_inline u32 diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index c4ac2352..69d4ecb9 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -35,6 +35,11 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, rx_fifo_size = (sizeof (*f) + a->rx_fifo_size) * a->preallocated_fifo_pairs; tx_fifo_size = (sizeof (*f) + a->tx_fifo_size) * a->preallocated_fifo_pairs; + if (0) + clib_warning ("rx_fifo_size %u (%d mb), tx_fifo_size %u (%d mb)", + rx_fifo_size, rx_fifo_size >> 20, + tx_fifo_size, tx_fifo_size >> 20); + /* Allocate rx fifo space. May fail. 
*/ rx_fifo_space = clib_mem_alloc_aligned_at_offset (rx_fifo_size, CLIB_CACHE_LINE_BYTES, 0 /* align_offset */ , @@ -129,7 +134,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) ssvm_pop_heap (oldheap); sh->ready = 1; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -141,35 +146,81 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; ssvm_shared_header_t *sh; svm_fifo_segment_header_t *fsh; + void *oldheap; + u8 **heaps = 0; + mheap_t *heap_header; + int segment_count = 1; + int i; - /* Allocate a fresh segment */ - pool_get (sm->segments, s); - memset (s, 0, sizeof (*s)); - - s->ssvm.ssvm_size = ~0; - s->ssvm.i_am_master = 1; - s->ssvm.my_pid = getpid (); - s->ssvm.name = (u8 *) a->segment_name; - s->ssvm.requested_va = ~0; - - /* Allocate a [sic] shared memory header, in process memory... */ - sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); - s->ssvm.sh = sh; + if (a->private_segment_count && a->private_segment_size) + { + void *mem; + u8 *heap; + u32 pagesize = clib_mem_get_page_size (); + u32 rnd_size; - memset (sh, 0, sizeof (*sh)); - sh->heap = clib_mem_get_heap (); + for (i = 0; i < a->private_segment_count; i++) + { + rnd_size = (a->private_segment_size + (pagesize - 1)) & ~pagesize; + + mem = mmap (0, rnd_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1 /* fd */ , 0 /* offset */ ); + + if (mem == MAP_FAILED) + { + clib_unix_warning ("mmap"); + return -1; + } + heap = mheap_alloc (mem, rnd_size); + heap_header = mheap_header (heap); + heap_header->flags |= MHEAP_FLAG_THREAD_SAFE; + vec_add1 (heaps, heap); + } + segment_count = a->private_segment_count; + } - /* Set up svm_fifo_segment shared header */ - fsh = clib_mem_alloc (sizeof (*fsh)); - memset (fsh, 0, sizeof (*fsh)); - sh->opaque[0] = fsh; - s->h = fsh; - fsh->segment_name = format (0, "%s%c", 
a->segment_name, 0); + /* Spread preallocated fifo pairs across segments */ + a->preallocated_fifo_pairs /= segment_count; - preallocate_fifo_pairs (fsh, a); + /* Allocate segments */ + for (i = 0; i < segment_count; i++) + { + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = ~0; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = ~0; + + /* Allocate a [sic] shared memory header, in process memory... */ + sh = clib_mem_alloc_aligned (sizeof (*sh), CLIB_CACHE_LINE_BYTES); + s->ssvm.sh = sh; + + memset (sh, 0, sizeof (*sh)); + sh->heap = a->private_segment_count ? heaps[i] : clib_mem_get_heap (); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + if (a->private_segment_count) + { + oldheap = clib_mem_get_heap (); + clib_mem_set_heap (sh->heap); + preallocate_fifo_pairs (fsh, a); + clib_mem_set_heap (oldheap); + } - sh->ready = 1; - a->new_segment_index = s - sm->segments; + sh->ready = 1; + vec_add1 (a->new_segment_indices, s - sm->segments); + } + vec_free (heaps); return (0); } @@ -205,7 +256,7 @@ svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; s->h = fsh; - a->new_segment_index = s - sm->segments; + vec_add1 (a->new_segment_indices, s - sm->segments); return (0); } @@ -230,7 +281,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 1); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -261,7 +312,7 @@ svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, if (PREDICT_FALSE (f == 0)) { ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (0); } @@ -281,7 
+332,7 @@ found: } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); return (f); } @@ -293,10 +344,11 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, svm_fifo_segment_header_t *fsh; void *oldheap; + sh = s->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; - ssvm_lock (sh, 1, 0); + ssvm_lock_non_recursive (sh, 2); oldheap = ssvm_push_heap (sh); switch (list_index) @@ -325,7 +377,7 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, } ssvm_pop_heap (oldheap); - ssvm_unlock (sh); + ssvm_unlock_non_recursive (sh); } void diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 31e14db5..a7a3f469 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -57,10 +57,12 @@ typedef struct { char *segment_name; u32 segment_size; - u32 new_segment_index; + u32 *new_segment_indices; u32 rx_fifo_size; u32 tx_fifo_size; u32 preallocated_fifo_pairs; + u32 private_segment_count; + u32 private_segment_size; } svm_fifo_segment_create_args_t; static inline svm_fifo_segment_private_t * diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c index 63b4a9b7..63d75845 100644 --- a/src/svm/test_svm_fifo1.c +++ b/src/svm/test_svm_fifo1.c @@ -39,7 +39,7 @@ hello_world (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -92,7 +92,7 @@ master (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 4096, FIFO_SEGMENT_RX_FREELIST); @@ -128,7 +128,7 @@ mempig (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); 
- sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); for (i = 0; i < 1000; i++) { @@ -186,7 +186,7 @@ offset (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); f = svm_fifo_segment_alloc_fifo (sp, 200 << 10, FIFO_SEGMENT_RX_FREELIST); @@ -246,7 +246,7 @@ slave (int verbose) if (rv) return clib_error_return (0, "svm_fifo_segment_attach returned %d", rv); - sp = svm_fifo_get_segment (a->new_segment_index); + sp = svm_fifo_get_segment (a->new_segment_indices[0]); sh = sp->ssvm.sh; fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 45ad35a4..a8e39eaa 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -707,7 +707,7 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) return; } - segment_index = a->new_segment_index; + segment_index = a->new_segment_indices[0]; vec_add2 (utm->seg, seg, 1); memcpy (seg, sm->segments + segment_index, sizeof (*seg)); sleep (1); diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index 3cc56f37..8a953719 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -174,6 +174,8 @@ application_init (application_t * app, u32 api_client_index, u64 * options, props->preallocated_fifo_pairs = options[APP_OPTIONS_PREALLOC_FIFO_PAIRS]; props->use_private_segment = options[APP_OPTIONS_FLAGS] & APP_OPTIONS_FLAGS_BUILTIN_APP; + props->private_segment_count = options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT]; + props->private_segment_size = options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE]; first_seg_size = options[SESSION_OPTIONS_SEGMENT_SIZE]; if ((rv = segment_manager_init (sm, props, first_seg_size))) diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index 
338ae857..566a52d7 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -275,27 +275,6 @@ vnet_application_detach (vnet_app_detach_args_t * a) return 0; } -session_type_t -session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) -{ - if (proto == SESSION_PROTO_TCP) - { - if (is_ip4) - return SESSION_TYPE_IP4_TCP; - else - return SESSION_TYPE_IP6_TCP; - } - else - { - if (is_ip4) - return SESSION_TYPE_IP4_UDP; - else - return SESSION_TYPE_IP6_UDP; - } - - return SESSION_N_TYPES; -} - int vnet_bind_uri (vnet_bind_args_t * a) { diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h index 4d6f9def..ed9f89b3 100644 --- a/src/vnet/session/application_interface.h +++ b/src/vnet/session/application_interface.h @@ -22,12 +22,6 @@ #include #include -typedef enum _session_api_proto -{ - SESSION_PROTO_TCP, - SESSION_PROTO_UDP -} session_api_proto_t; - typedef struct _vnet_app_attach_args_t { /** Binary API client index */ @@ -65,7 +59,7 @@ typedef struct _vnet_bind_args_t struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; @@ -98,7 +92,7 @@ typedef struct _vnet_connect_args struct { transport_endpoint_t tep; - session_api_proto_t proto; + transport_proto_t proto; }; }; u32 app_index; @@ -120,6 +114,8 @@ typedef enum APP_EVT_QUEUE_SIZE, APP_OPTIONS_FLAGS, APP_OPTIONS_PREALLOC_FIFO_PAIRS, + APP_OPTIONS_PRIVATE_SEGMENT_COUNT, + APP_OPTIONS_PRIVATE_SEGMENT_SIZE, SESSION_OPTIONS_SEGMENT_SIZE, SESSION_OPTIONS_ADD_SEGMENT_SIZE, SESSION_OPTIONS_RX_FIFO_SIZE, diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b24f5fd9..56e62637 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -378,24 +378,12 @@ session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, n_tx_pkts, 0); } -stream_session_t * -session_event_get_session (session_fifo_event_t * e0, u8 thread_index) +always_inline 
stream_session_t * +session_event_get_session (session_fifo_event_t * e, u8 thread_index) { - svm_fifo_t *f0; - stream_session_t *s0; - u32 session_index0; - - f0 = e0->fifo; - session_index0 = f0->master_session_index; - - /* $$$ add multiple event queues, per vpp worker thread */ - ASSERT (f0->master_thread_index == thread_index); - - s0 = stream_session_get_if_valid (session_index0, thread_index); - - ASSERT (s0 == 0 || s0->thread_index == thread_index); - - return s0; + ASSERT (e->fifo->master_thread_index == thread_index); + return stream_session_get_if_valid (e->fifo->master_session_index, + thread_index); } void @@ -569,7 +557,6 @@ skip_dequeue: case FIFO_EVENT_BUILTIN_RX: s0 = session_event_get_session (e0, my_thread_index); svm_fifo_unset_event (s0->server_rx_fifo); - /* Get session's server */ app = application_get (s0->app_index); app->cb_fns.builtin_server_rx_callback (s0); break; diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index dcef6261..262b7faa 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -30,7 +30,7 @@ segment_manager_t *segment_managers = 0; /** * Process private segment index */ -u32 private_segment_index = ~0; +u32 *private_segment_indices; /** * Default fifo and segment size. TODO config. 
@@ -70,7 +70,8 @@ session_manager_add_segment_i (segment_manager_t * sm, u32 segment_size, return VNET_API_ERROR_SVM_SEGMENT_CREATE_FAIL; } - vec_add1 (sm->segment_indices, ca->new_segment_index); + vec_append (sm->segment_indices, ca->new_segment_indices); + vec_free (ca->new_segment_indices); return 0; } @@ -111,22 +112,23 @@ static void { svm_fifo_segment_create_args_t _a, *a = &_a; - if (private_segment_index != ~0) + if (private_segment_indices) return; memset (a, 0, sizeof (*a)); a->segment_name = "process-private-segment"; a->segment_size = ~0; - a->new_segment_index = ~0; a->rx_fifo_size = props->rx_fifo_size; a->tx_fifo_size = props->tx_fifo_size; a->preallocated_fifo_pairs = props->preallocated_fifo_pairs; + a->private_segment_count = props->private_segment_count; + a->private_segment_size = props->private_segment_size; if (svm_fifo_segment_create_process_private (a)) clib_warning ("Failed to create process private segment"); - private_segment_index = a->new_segment_index; - ASSERT (private_segment_index != ~0); + private_segment_indices = a->new_segment_indices; + ASSERT (vec_len (private_segment_indices)); } /** @@ -156,10 +158,10 @@ segment_manager_init (segment_manager_t * sm, } else { - if (private_segment_index == ~0) + if (vec_len (private_segment_indices) == 0) segment_manager_alloc_process_private_segment (properties); - ASSERT (private_segment_index != ~0); - vec_add1 (sm->segment_indices, private_segment_index); + ASSERT (vec_len (private_segment_indices)); + vec_append (sm->segment_indices, private_segment_indices); } clib_spinlock_init (&sm->lockp); @@ -320,7 +322,7 @@ again: /* See if we're supposed to create another segment */ if (*server_rx_fifo == 0) { - if (sm->properties->add_segment) + if (sm->properties->add_segment && !sm->properties->use_private_segment) { if (added_a_segment) { @@ -379,6 +381,10 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, svm_fifo_segment_free_fifo (fifo_segment, tx_fifo, 
FIFO_SEGMENT_TX_FREELIST); + /* Don't try to delete process-private segments */ + if (sm->properties->private_segment_count > 0) + return; + /* Remove segment only if it holds no fifos and not the first */ if (sm->segment_indices[0] != svm_segment_index && !svm_fifo_segment_has_fifos (fifo_segment)) diff --git a/src/vnet/session/segment_manager.h b/src/vnet/session/segment_manager.h index df38d2b3..41abeb22 100644 --- a/src/vnet/session/segment_manager.h +++ b/src/vnet/session/segment_manager.h @@ -39,6 +39,10 @@ typedef struct _segment_manager_properties /** Use private memory segment instead of shared memory */ u8 use_private_segment; + + /** Use one or more private mheaps, instead of the global heap */ + u32 private_segment_count; + u32 private_segment_size; } segment_manager_properties_t; typedef struct _segment_manager diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index fe198044..0a86d563 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -198,21 +198,28 @@ stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) */ stream_session_t * stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; + stream_session_t *s; int rv; /* Lookup session amongst established ones */ make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); if (rv == 0) - return stream_session_get_tsi (kv4.value, my_thread_index); + return stream_session_get_from_handle (kv4.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener4 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener4 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 
(&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return stream_session_get_from_handle (kv4.value); + return 0; } stream_session_t * @@ -242,20 +249,27 @@ stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) * wildcarded local source (listener bound to all interfaces) */ stream_session_t * stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) + u16 lcl_port, u16 rmt_port, u8 proto) { session_manager_main_t *smm = vnet_get_session_manager_main (); session_kv6_t kv6; + stream_session_t *s; int rv; make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); if (rv == 0) - return stream_session_get_tsi (kv6.value, my_thread_index); + return stream_session_get_from_handle (kv6.value); /* If nothing is found, check if any listener is available */ - return stream_session_lookup_listener6 (lcl, lcl_port, proto); + if ((s = stream_session_lookup_listener6 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return stream_session_get_from_handle (kv6.value); + return 0; } stream_session_t * @@ -340,7 +354,6 @@ stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); if (rv == 0) return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); - return 0; } @@ -390,6 +403,8 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, u32 thread_index = tc->thread_index; int rv; + ASSERT (thread_index == vlib_get_thread_index ()); + if ((rv = segment_manager_alloc_session_fifos (sm, &server_rx_fifo, &server_tx_fifo, &fifo_segment_index))) @@ -854,6 +869,7 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, s->app_index = server->index; s->listener_index = listener_index; + 
s->session_state = SESSION_STATE_ACCEPTING; /* Shoulder-tap the server */ if (notify) @@ -1088,6 +1104,27 @@ session_vpp_event_queue_allocate (session_manager_main_t * smm, } } +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4) +{ + if (proto == TRANSPORT_PROTO_TCP) + { + if (is_ip4) + return SESSION_TYPE_IP4_TCP; + else + return SESSION_TYPE_IP6_TCP; + } + else + { + if (is_ip4) + return SESSION_TYPE_IP4_UDP; + else + return SESSION_TYPE_IP6_UDP; + } + + return SESSION_N_TYPES; +} + static clib_error_t * session_manager_main_enable (vlib_main_t * vm) { @@ -1131,14 +1168,13 @@ session_manager_main_enable (vlib_main_t * vm) session_vpp_event_queue_allocate (smm, i); /* $$$$ preallocate hack config parameter */ - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) { - stream_session_t *ss; + stream_session_t *ss __attribute__ ((unused)); pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); - memset (ss, 0, sizeof (*ss)); } - for (i = 0; i < 200000; i++) + for (i = 0; i < smm->preallocated_sessions; i++) pool_put_index (smm->sessions[0], i); clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", @@ -1208,9 +1244,10 @@ session_manager_main_init (vlib_main_t * vm) return 0; } -VLIB_INIT_FUNCTION (session_manager_main_init) - static clib_error_t *session_config_fn (vlib_main_t * vm, - unformat_input_t * input) +VLIB_INIT_FUNCTION (session_manager_main_init); + +static clib_error_t * +session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t *smm = &session_manager_main; u32 nitems; @@ -1224,6 +1261,9 @@ VLIB_INIT_FUNCTION (session_manager_main_init) else clib_warning ("event queue length %d too small, ignored", nitems); } + if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/session/session.h 
b/src/vnet/session/session.h index 5fa4225c..b4507d4e 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -80,6 +80,10 @@ typedef enum SESSION_N_TYPES, } session_type_t; + +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4); + /* * Application session state */ @@ -87,6 +91,7 @@ typedef enum { SESSION_STATE_LISTENING, SESSION_STATE_CONNECTING, + SESSION_STATE_ACCEPTING, SESSION_STATE_READY, SESSION_STATE_CLOSED, SESSION_STATE_N_STATES, @@ -211,8 +216,12 @@ struct _session_manager_main /** Per transport rx function that can either dequeue or peek */ session_fifo_rx_fn *session_tx_fns[SESSION_N_TYPES]; + /** Session manager is enabled */ u8 is_enabled; + /** Preallocate session config parameter */ + u32 preallocated_sessions; + /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -247,13 +256,12 @@ stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); + u16 rmt_port, u8 proto); stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto); stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8, u32 thread_index); + u16 rmt_port, u8 proto); transport_connection_t * stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, u16 lcl_port, @@ -277,9 +285,24 @@ stream_session_get_tsi (u64 ti_and_si, u32 thread_index) ti_and_si & 0xFFFFFFFFULL); } +always_inline u8 +stream_session_is_valid (u32 si, u8 thread_index) +{ + stream_session_t *s; + s = pool_elt_at_index (session_manager_main.sessions[thread_index], si); + if (s->thread_index != thread_index || s->session_index != si + || s->server_rx_fifo->master_session_index != si + || s->server_tx_fifo->master_session_index != si + || 
s->server_rx_fifo->master_thread_index != thread_index + || s->server_tx_fifo->master_thread_index != thread_index) + return 0; + return 1; +} + always_inline stream_session_t * stream_session_get (u32 si, u32 thread_index) { + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } @@ -292,6 +315,7 @@ stream_session_get_if_valid (u64 si, u32 thread_index) if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) return 0; + ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 6b8341aa..e06bc586 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -47,7 +47,8 @@ format_stream_session (u8 * s, va_list * args) svm_fifo_max_enqueue (ss->server_tx_fifo), stream_session_get_index (ss)); - if (ss->session_state == SESSION_STATE_READY) + if (ss->session_state == SESSION_STATE_READY + || ss->session_state == SESSION_STATE_ACCEPTING) { s = format (s, "%U", tp_vft->format_connection, ss->connection_index, ss->thread_index, verbose); @@ -68,8 +69,9 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose); + s = + format (s, "[CL] %U", tp_vft->format_connection, ss->connection_index, + ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); if (verbose > 1) @@ -93,7 +95,13 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, int verbose = 0, i; stream_session_t *pool; stream_session_t *s; - u8 *str = 0; + u8 *str = 0, one_session = 0, proto_set = 0, proto = 0; + u8 is_ip4 = 0, s_type = 0; + ip4_address_t lcl_ip4, rmt_ip4; + u32 lcl_port = 0, rmt_port = 0; + + memset (&lcl_ip4, 0, sizeof (lcl_ip4)); + memset 
(&rmt_ip4, 0, sizeof (rmt_ip4)); if (!smm->is_enabled) { @@ -106,10 +114,43 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "verbose")) verbose = 1; + else if (unformat (input, "tcp")) + { + proto_set = 1; + proto = TRANSPORT_PROTO_TCP; + } + else if (unformat (input, "%U:%d->%U:%d", + unformat_ip4_address, &lcl_ip4, &lcl_port, + unformat_ip4_address, &rmt_ip4, &rmt_port)) + { + one_session = 1; + is_ip4 = 1; + } + else break; } + if (one_session) + { + if (!proto_set) + { + vlib_cli_output (vm, "proto not set"); + return clib_error_return (0, "proto not set"); + } + + s_type = session_type_from_proto_and_ip (proto, is_ip4); + s = stream_session_lookup4 (&lcl_ip4, &rmt_ip4, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), s_type); + if (s) + vlib_cli_output (vm, "%U", format_stream_session, s, 2); + else + vlib_cli_output (vm, "session does not exist"); + + return 0; + } + for (i = 0; i < vec_len (smm->sessions); i++) { u32 once_per_pool; @@ -146,6 +187,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } else vlib_cli_output (vm, "Thread %d: no active sessions", i); + vec_reset_length (str); } vec_free (str); @@ -161,15 +203,22 @@ VLIB_CLI_COMMAND (show_session_command, static) = }; /* *INDENT-ON* */ +static int +clear_session (stream_session_t * s) +{ + application_t *server = application_get (s->app_index); + server->cb_fns.session_disconnect_callback (s); + return 0; +} + static clib_error_t * clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { session_manager_main_t *smm = &session_manager_main; - u32 thread_index = 0; + u32 thread_index = 0, clear_all = 0; u32 session_index = ~0; - stream_session_t *pool, *session; - application_t *server; + stream_session_t **pool, *session; if (!smm->is_enabled) { @@ -182,28 +231,36 @@ clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, 
"session %d", &session_index)) ; + else if (unformat (input, "all")) + clear_all = 1; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } - if (session_index == ~0) + if (!clear_all && session_index == ~0) return clib_error_return (0, "session required, but not set."); - if (thread_index > vec_len (smm->sessions)) - return clib_error_return (0, "thread %d out of range [0-%d]", - thread_index, vec_len (smm->sessions)); - - pool = smm->sessions[thread_index]; - - if (pool_is_free_index (pool, session_index)) - return clib_error_return (0, "session %d not active", session_index); - - session = pool_elt_at_index (pool, session_index); - server = application_get (session->app_index); + if (session_index != ~0) + { + session = stream_session_get_if_valid (session_index, thread_index); + if (!session) + return clib_error_return (0, "no session %d on thread %d", + session_index, thread_index); + clear_session (session); + } - /* Disconnect both app and transport */ - server->cb_fns.session_disconnect_callback (session); + if (clear_all) + { + /* *INDENT-OFF* */ + vec_foreach (pool, smm->sessions) + { + pool_foreach(session, *pool, ({ + clear_session (session); + })); + }; + /* *INDENT-ON* */ + } return 0; } diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 561a9257..9c38bab9 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -225,6 +225,12 @@ make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) t->rmt_port, t->proto); } +typedef enum _transport_proto +{ + TRANSPORT_PROTO_TCP, + TRANSPORT_PROTO_UDP +} transport_proto_t; + typedef struct _transport_endpoint { ip46_address_t ip; /** ip address */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 6f8be082..a6c8a235 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -170,62 +170,90 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { 
tclient_main_t *tm = &tclient_main; int my_thread_index = vlib_get_thread_index (); - vl_api_disconnect_session_t *dmp; session_t *sp; int i; int delete_session; u32 *connection_indices; - u32 tx_quota = 0; - u32 delta, prev_bytes_received_this_session; + u32 *connections_this_batch; + u32 nconnections_this_batch; connection_indices = tm->connection_index_by_thread[my_thread_index]; + connections_this_batch = + tm->connections_this_batch_by_thread[my_thread_index]; - if (tm->run_test == 0 || vec_len (connection_indices) == 0) + if ((tm->run_test == 0) || + ((vec_len (connection_indices) == 0) + && vec_len (connections_this_batch) == 0)) return 0; - for (i = 0; i < vec_len (connection_indices); i++) + /* Grab another pile of connections */ + if (PREDICT_FALSE (vec_len (connections_this_batch) == 0)) + { + nconnections_this_batch = + clib_min (tm->connections_per_batch, vec_len (connection_indices)); + + ASSERT (nconnections_this_batch > 0); + vec_validate (connections_this_batch, nconnections_this_batch - 1); + clib_memcpy (connections_this_batch, + connection_indices + vec_len (connection_indices) + - nconnections_this_batch, + nconnections_this_batch * sizeof (u32)); + _vec_len (connection_indices) -= nconnections_this_batch; + } + + if (PREDICT_FALSE (tm->prev_conns != tm->connections_per_batch + && tm->prev_conns == vec_len (connections_this_batch))) + { + tm->repeats++; + tm->prev_conns = vec_len (connections_this_batch); + if (tm->repeats == 500000) + { + clib_warning ("stuck clients"); + } + } + else + { + tm->prev_conns = vec_len (connections_this_batch); + tm->repeats = 0; + } + + for (i = 0; i < vec_len (connections_this_batch); i++) { delete_session = 1; - sp = pool_elt_at_index (tm->sessions, connection_indices[i]); + sp = pool_elt_at_index (tm->sessions, connections_this_batch[i]); - if ((tm->no_return || tx_quota < 60) && sp->bytes_to_send > 0) + if (sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; - tx_quota++; } - if 
(!tm->no_return && sp->bytes_to_receive > 0) + if (sp->bytes_to_receive > 0) { - prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); - delta = sp->bytes_received - prev_bytes_received_this_session; - if (delta > 0) - tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) { - __sync_fetch_and_add (&tm->tx_total, tm->bytes_to_send); + u32 index, thread_index; + stream_session_t *s; + + __sync_fetch_and_add (&tm->tx_total, sp->bytes_sent); __sync_fetch_and_add (&tm->rx_total, sp->bytes_received); - dmp = vl_msg_api_alloc_as_if_client (sizeof (*dmp)); - memset (dmp, 0, sizeof (*dmp)); - dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); - dmp->client_index = tm->my_client_index; - dmp->handle = sp->vpp_session_handle; - if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, - 1)) + stream_session_parse_handle (sp->vpp_session_handle, + &index, &thread_index); + s = stream_session_get_if_valid (index, thread_index); + + if (s) { - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; + stream_session_disconnect (s); + vec_delete (connections_this_batch, 1, i); + i--; __sync_fetch_and_add (&tm->ready_connections, -1); } else - { - vl_msg_api_free (dmp); - } + clib_warning ("session AWOL?"); /* Kick the debug CLI process */ if (tm->ready_connections == 0) @@ -236,6 +264,10 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, } } } + + tm->connection_index_by_thread[my_thread_index] = connection_indices; + tm->connections_this_batch_by_thread[my_thread_index] = + connections_this_batch; return 0; } @@ -356,6 +388,8 @@ tcp_test_clients_init (vlib_main_t * vm) tm->vlib_main = vm; vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains); + vec_validate (tm->connections_this_batch_by_thread, + thread_main->n_vlib_mains); return 0; } @@ -388,7 +422,8 @@ builtin_session_connected_callback (u32 app_index, u32 
api_context, pool_get (tm->sessions, session); memset (session, 0, sizeof (*session)); session_index = session - tm->sessions; - session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_send = tm->bytes_to_send; + session->bytes_to_receive = tm->no_return ? 0ULL : tm->bytes_to_send; session->server_rx_fifo = s->server_rx_fifo; session->server_rx_fifo->client_session_index = session_index; session->server_tx_fifo = s->server_tx_fifo; @@ -485,6 +520,8 @@ attach_builtin_test_clients_app (void) options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count; + options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size; options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; @@ -561,6 +598,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->bytes_to_send = 8192; tm->no_return = 0; tm->fifo_size = 64 << 10; + tm->connections_per_batch = 1000; + tm->private_segment_count = 0; + tm->private_segment_size = 0; vec_free (tm->connect_uri); @@ -582,6 +622,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->no_return = 1; else if (unformat (input, "fifo-size %d", &tm->fifo_size)) tm->fifo_size <<= 10; + else if (unformat (input, "private-segment-count %d", + &tm->private_segment_count)) + ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + tm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + tm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + tm->private_segment_size = tmp; + else if (unformat (input, "preallocate-fifos")) + tm->prealloc_fifos = 1; + else + if (unformat (input, "client-batch %d", &tm->connections_per_batch)) + ; else return clib_error_return (0, "unknown input 
`%U'", format_unformat_error, input); @@ -688,9 +742,13 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "zero delta-t?"); cleanup: - pool_free (tm->sessions); + tm->run_test = 0; for (i = 0; i < vec_len (tm->connection_index_by_thread); i++) - vec_reset_length (tm->connection_index_by_thread[i]); + { + vec_reset_length (tm->connection_index_by_thread[i]); + vec_reset_length (tm->connections_this_batch_by_thread[i]); + } + pool_free (tm->sessions); return 0; } diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 3462e0ee..38af231d 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -63,6 +63,9 @@ typedef struct u32 configured_segment_size; u32 fifo_size; u32 expected_connections; /**< Number of clients/connections */ + u32 connections_per_batch; /**< Connections to rx/tx at once */ + u32 private_segment_count; /**< Number of private fifo segs */ + u32 private_segment_size; /**< size of private fifo segs */ /* * Test state variables @@ -72,6 +75,7 @@ typedef struct uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; + u32 **connections_this_batch_by_thread; /**< active connection batch */ pthread_t client_thread_handle; volatile u32 ready_connections; @@ -82,7 +86,8 @@ typedef struct f64 test_start_time; f64 test_end_time; - + u32 prev_conns; + u32 repeats; /* * Flags */ diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 775bfc26..8e958ac0 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -56,12 +56,15 @@ typedef struct u32 fifo_size; /**< Fifo size */ u32 rcv_buffer_size; /**< Rcv buffer size */ u32 prealloc_fifos; /**< Preallocate fifos */ + u32 private_segment_count; /**< Number of private segments */ + u32 private_segment_size; /**< Size of private segments */ /* * Test state */ u8 **rx_buf; /**< Per-thread RX 
buffer */ u64 byte_index; + u32 **rx_retries; vlib_main_t *vlib_main; } builtin_server_main_t; @@ -77,6 +80,8 @@ builtin_session_accept_callback (stream_session_t * s) session_manager_get_vpp_event_queue (s->thread_index); s->session_state = SESSION_STATE_READY; bsm->byte_index = 0; + vec_validate (bsm->rx_retries[s->thread_index], s->session_index); + bsm->rx_retries[s->thread_index][s->session_index] = 0; return 0; } @@ -173,11 +178,16 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; - u32 my_thread_id = vlib_get_thread_index (); + u32 thread_index = vlib_get_thread_index (); + + ASSERT (s->thread_index == thread_index); rx_fifo = s->server_rx_fifo; tx_fifo = s->server_tx_fifo; + ASSERT (rx_fifo->master_thread_index == thread_index); + ASSERT (tx_fifo->master_thread_index == thread_index); + max_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); max_enqueue = svm_fifo_max_enqueue (s->server_tx_fifo); @@ -201,21 +211,31 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_BUILTIN_RX; evt.event_id = 0; - q = bsm->vpp_queue[s->thread_index]; + q = bsm->vpp_queue[thread_index]; if (PREDICT_FALSE (q->cursize == q->maxsize)) clib_warning ("out of event queue space"); - else - unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* don't wait for mutex */ ); + else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */ + )) + clib_warning ("failed to enqueue self-tap"); + + bsm->rx_retries[thread_index][s->session_index]++; + if (bsm->rx_retries[thread_index][s->session_index] == 500000) + { + clib_warning ("session stuck: %U", format_stream_session, s, 2); + } + } + else + { + bsm->rx_retries[thread_index][s->session_index] = 0; } return 0; } - _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; + _vec_len (bsm->rx_buf[thread_index]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 
max_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -225,7 +245,7 @@ builtin_server_rx_callback (stream_session_t * s) */ n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, - bsm->rx_buf[my_thread_id]); + bsm->rx_buf[thread_index]); if (n_written != max_transfer) clib_warning ("short trout!"); @@ -237,11 +257,13 @@ builtin_server_rx_callback (stream_session_t * s) evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], - (u8 *) & evt, 0 /* do wait for mutex */ ); + if (unix_shared_memory_queue_add (bsm->vpp_queue[s->thread_index], + (u8 *) & evt, + 0 /* do wait for mutex */ )) + clib_warning ("failed to enqueue tx evt"); } - if (PREDICT_FALSE (max_enqueue < max_dequeue)) + if (PREDICT_FALSE (n_written < max_dequeue)) goto rx_event; return 0; @@ -328,9 +350,13 @@ server_attach () a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 512 << 20; a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = bsm->fifo_size; a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = bsm->fifo_size; - a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = bsm->private_segment_count; + a->options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = bsm->private_segment_size; a->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = bsm->prealloc_fifos ? 
bsm->prealloc_fifos : 1; + + a->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_BUILTIN_APP; + a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); @@ -374,6 +400,8 @@ server_create (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (builtin_server_main.vpp_queue, num_threads - 1); vec_validate (bsm->rx_buf, num_threads - 1); + vec_validate (bsm->rx_retries, num_threads - 1); + for (i = 0; i < num_threads; i++) vec_validate (bsm->rx_buf[i], bsm->rcv_buffer_size); @@ -435,11 +463,14 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, { builtin_server_main_t *bsm = &builtin_server_main; int rv; + u32 tmp; bsm->no_echo = 0; bsm->fifo_size = 64 << 10; bsm->rcv_buffer_size = 128 << 10; bsm->prealloc_fifos = 0; + bsm->private_segment_count = 0; + bsm->private_segment_size = 0; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -449,8 +480,17 @@ server_create_command_fn (vlib_main_t * vm, unformat_input_t * input, bsm->fifo_size <<= 10; else if (unformat (input, "rcv-buf-size %d", &bsm->rcv_buffer_size)) ; - else if (unformat (input, "prealloc-fifos", &bsm->prealloc_fifos)) + else if (unformat (input, "prealloc-fifos %d", &bsm->prealloc_fifos)) + ; + else if (unformat (input, "private-segment-count %d", + &bsm->private_segment_count)) ; + else if (unformat (input, "private-segment-size %dm", &tmp)) + bsm->private_segment_size = tmp << 20; + else if (unformat (input, "private-segment-size %dg", &tmp)) + bsm->private_segment_size = tmp << 30; + else if (unformat (input, "private-segment-size %d", &tmp)) + bsm->private_segment_size = tmp; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 4e85eb3f..f379e699 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -74,8 +74,16 @@ static void tcp_connection_unbind (u32 listener_index) { tcp_main_t *tm = vnet_get_tcp_main (); - 
TCP_EVT_DBG (TCP_EVT_UNBIND, - pool_elt_at_index (tm->listener_pool, listener_index)); + tcp_connection_t *tc; + + tc = pool_elt_at_index (tm->listener_pool, listener_index); + + TCP_EVT_DBG (TCP_EVT_UNBIND, tc); + + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put_index (tm->listener_pool, listener_index); } @@ -124,9 +132,20 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Check if half-open */ if (tc->state == TCP_STATE_SYN_SENT) - pool_put (tm->half_open_connections, tc); + { + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->half_open_connections, tc); + } else - pool_put (tm->connections[tc->c_thread_index], tc); + { + int thread_index = tc->c_thread_index; + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (tc, 0xFA, sizeof (*tc)); + pool_put (tm->connections[thread_index], tc); + } } /** @@ -168,13 +187,14 @@ tcp_connection_reset (tcp_connection_t * tc) /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); - stream_session_reset_notify (&tc->connection); + + /* Wait for cleanup from session layer but not forever */ + tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: return; } - } /** @@ -278,6 +298,9 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) tries = max - min; time_now = tcp_time_now (); + /* Only support active opens from thread 0 */ + ASSERT (vlib_get_thread_index () == 0); + /* Start at random point or max */ pool_get (tm->local_endpoints, tep); clib_memcpy (&tep->ip, ip, sizeof (*ip)); @@ -343,6 +366,7 @@ tcp_connection_timers_reset (tcp_connection_t * tc) } } +#if 0 typedef struct ip4_tcp_hdr { ip4_header_t ip; @@ -435,6 +459,7 @@ tcp_connection_fib_attach (tcp_connection_t * tc) tcp_connection_stack_on_fib_entry (tc); } +#endif /* 0 */ /** Initialize tcp connection variables * @@ -447,7 +472,7 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); 
scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); - tcp_connection_fib_attach (tc); + // tcp_connection_fib_attach (tc); } int @@ -485,14 +510,38 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) if (is_ip4) { ip4_address_t *ip4; - ip4 = ip_interface_get_first_ip (sw_if_index, 1); - lcl_addr.ip4.as_u32 = ip4->as_u32; + int index; + if (vec_len (tm->ip4_src_addresses)) + { + index = tm->last_v4_address_rotor++; + if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses)) + tm->last_v4_address_rotor = 0; + lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32; + } + else + { + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } } else { ip6_address_t *ip6; - ip6 = ip_interface_get_first_ip (sw_if_index, 0); - clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + int index; + + if (vec_len (tm->ip6_src_addresses)) + { + index = tm->last_v6_address_rotor++; + if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses)) + tm->last_v6_address_rotor = 0; + clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index], + sizeof (*ip6)); + } + else + { + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } } /* Allocate source port */ @@ -614,7 +663,7 @@ u8 * format_tcp_vars (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - s = format (s, " snd_una %u snd_nxt %u snd_una_max %u\n", + s = format (s, " snd_una %u snd_nxt %u snd_una_max %u", tc->snd_una - tc->iss, tc->snd_nxt - tc->iss, tc->snd_una_max - tc->iss); s = format (s, " rcv_nxt %u rcv_las %u\n", @@ -628,12 +677,17 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + s = format (s, " prev_ssthresh %u 
snd_congestion %u dupack %u", tc->prev_ssthresh, tc->snd_congestion - tc->iss, tc->rcv_dupacks); + s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss); + s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr, + tc->tsecr_last_ack); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); + s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, + tcp_time_now () - tc->tsval_recent_age); s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -719,11 +773,21 @@ format_tcp_sacks (u8 * s, va_list * args) tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); sack_block_t *sacks = tc->snd_sacks; sack_block_t *block; - vec_foreach (block, sacks) - { - s = format (s, " start %u end %u\n", block->start - tc->irs, - block->end - tc->irs); - } + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->irs, + block->end - tc->irs); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->irs, + block->end - tc->irs); + } return s; } @@ -796,14 +860,18 @@ tcp_session_send_mss (transport_connection_t * trans_conn) always_inline u32 tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) { - if (tc->snd_wnd < tc->snd_mss) + if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss)) { return tc->snd_wnd <= snd_space ? 
tc->snd_wnd : 0; } /* If we can't write at least a segment, don't try at all */ - if (snd_space < tc->snd_mss) - return 0; + if (PREDICT_FALSE (snd_space < tc->snd_mss)) + { + if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) + return snd_space; + return 0; + } /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1042,6 +1110,8 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; + int thread, i; + tcp_connection_t *tc __attribute__ ((unused)); if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1074,6 +1144,27 @@ tcp_main_enable (vlib_main_t * vm) num_threads = 1 /* main thread */ + vtm->n_threads; vec_validate (tm->connections, num_threads - 1); + /* + * Preallocate connections + */ + for (thread = 0; thread < num_threads; thread++) + { + for (i = 0; i < tm->preallocated_connections; i++) + pool_get (tm->connections[thread], tc); + + for (i = 0; i < tm->preallocated_connections; i++) + pool_put_index (tm->connections[thread], i); + } + + /* + * Preallocate half-open connections + */ + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_get (tm->half_open_connections, tc); + + for (i = 0; i < tm->preallocated_half_open_connections; i++) + pool_put_index (tm->half_open_connections, i); + /* Initialize per worker thread tx buffers (used for control messages) */ vec_validate (tm->tx_buffers, num_threads - 1); @@ -1116,7 +1207,6 @@ tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - tm->vlib_main = vm; tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; @@ -1125,6 +1215,97 @@ tcp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (tcp_init); + +static clib_error_t * +tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "preallocated-connections 
%d", + &tm->preallocated_connections)) + ; + else if (unformat (input, "preallocated-half-open-connections %d", + &tm->preallocated_half_open_connections)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp"); + +static clib_error_t * +tcp_src_address (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + ip4_address_t v4start, v4end; + ip6_address_t v6start, v6end; + int v4set = 0; + int v6set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U - %U", unformat_ip4_address, &v4start, + unformat_ip4_address, &v4end)) + v4set = 1; + else if (unformat (input, "%U", unformat_ip4_address, &v4start)) + { + memcpy (&v4end, &v4start, sizeof (v4start)); + v4set = 1; + } + else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start, + unformat_ip4_address, &v6end)) + v6set = 1; + else if (unformat (input, "%U", unformat_ip6_address, &v6start)) + { + memcpy (&v6end, &v6start, sizeof (v4start)); + v6set = 1; + } + else + break; + } + + if (!v4set && !v6set) + return clib_error_return (0, "at least one v4 or v6 address required"); + + if (v4set) + { + u32 tmp; + + do + { + vec_add1 (tm->ip4_src_addresses, v4start); + tmp = clib_net_to_host_u32 (v4start.as_u32); + tmp++; + v4start.as_u32 = clib_host_to_net_u32 (tmp); + } + while (clib_host_to_net_u32 (v4start.as_u32) <= + clib_host_to_net_u32 (v4end.as_u32)); + } + if (v6set) + { + clib_warning ("v6 src address list unimplemented..."); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_src_address_command, static) = +{ + .path = "tcp src-address", + .short_help = "tcp src-address [- ] add src address range", + .function = tcp_src_address, +}; +/* *INDENT-ON* */ + + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 
12d804b8..37b10fd4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -348,6 +348,16 @@ typedef struct _tcp_main /* Flag that indicates if stack is on or off */ u8 is_enabled; + /** Number of preallocated connections */ + u32 preallocated_connections; + u32 preallocated_half_open_connections; + + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ + ip4_address_t *ip4_src_addresses; + u32 last_v4_address_rotor; + u32 last_v6_address_rotor; + ip6_address_t *ip6_src_addresses; + /* convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; @@ -569,6 +579,7 @@ tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -577,6 +588,7 @@ tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) return; @@ -588,6 +600,7 @@ tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) always_inline void tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) { + ASSERT (tc->c_thread_index == vlib_get_thread_index ()); if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->timers[timer_id]); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index ae68ad1b..be51bca2 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -383,9 +383,16 @@ typedef enum _tcp_dbg_evt "establish", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ - ed->data[0] = _timer_id; \ - ed->data[1] = _timer_id; \ + if (_tc) \ + { \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _timer_id; \ + 
ed->data[1] = _timer_id; \ + } \ + else \ + { \ + clib_warning ("pop for unexisting connection %d", _tc_index); \ + } \ } #define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a2e6dad1..45db0da6 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -251,6 +251,7 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { + ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } @@ -383,12 +384,9 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) if (tc->srtt != 0) { err = mrtt - tc->srtt; -// tc->srtt += err >> 3; /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. * The increase should be bound */ -// tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2; - tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1); diff = (clib_abs (err) - (int) tc->rttvar) >> 2; tc->rttvar = clib_max ((int) tc->rttvar + diff, 1); @@ -491,6 +489,14 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } +static u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + /** * Checks if ack is a congestion control event. 
*/ @@ -503,7 +509,7 @@ tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, *is_dack = tc->sack_sb.last_sacked_bytes || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); - return (*is_dack || tcp_in_cong_recovery (tc)); + return ((*is_dack || tcp_in_cong_recovery (tc)) && !tcp_is_lost_fin (tc)); } void @@ -750,10 +756,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) * last hole end */ tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->high_sacked) - && seq_gt (tc->snd_una_max, last_hole->end)) - last_hole->end = tc->snd_una_max; - /* keep track of max byte sacked for when the last hole + if (seq_gt (tc->snd_una_max, last_hole->end)) + { + if (seq_geq (last_hole->start, sb->high_sacked)) + { + last_hole->end = tc->snd_una_max; + } + /* New hole after high sacked block */ + else if (seq_lt (sb->high_sacked, tc->snd_una_max)) + { + scoreboard_insert_hole (sb, sb->tail, sb->high_sacked, + tc->snd_una_max); + } + } + /* Keep track of max byte sacked for when the last hole * is acked */ if (seq_gt (tmp.end, sb->high_sacked)) sb->high_sacked = tmp.end; @@ -764,7 +780,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { blk = &tc->rcv_opts.sacks[blk_index]; - if (seq_leq (blk->start, hole->start)) { /* Block covers hole. 
Remove hole */ @@ -784,6 +799,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } else if (!next_hole) { + ASSERT (seq_geq (sb->high_sacked, ack)); sb->snd_una_adv = sb->high_sacked - ack; sb->last_bytes_delivered += sb->high_sacked - hole->end; } @@ -819,7 +835,6 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { hole->end = blk->start; } - hole = scoreboard_next_hole (sb, hole); } } @@ -827,10 +842,13 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - seq_max (tc->snd_una, ack)); + ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) + || sb->holes[sb->head].start == ack + sb->snd_una_adv); } /** @@ -916,7 +934,8 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tc->snd_rxt_ts + return (tcp_in_recovery (tc) + && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); } @@ -994,6 +1013,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { ASSERT (tc->snd_una != tc->snd_una_max || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) @@ -1012,17 +1032,20 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) goto partial_ack_test; } - /* If of of the two conditions lower hold, reset dupacks - * 1) Cumulative ack does not cover more than congestion threshold, - * and the following doesn't hold: the congestion window is - * greater than SMSS bytes and the difference between highest_ack - * and prev_highest_ack is at most 4*SMSS bytes (XXX) - * 2) RFC6582 heuristic to avoid multiple fast retransmits + /* If of 
of the two conditions lower hold, reset dupacks because + * we're probably after timeout (RFC6582 heuristics). + * If Cumulative ack does not cover more than congestion threshold, + * and: + * 1) The following doesn't hold: The congestion window is greater + * than SMSS bytes and the difference between highest_ack + * and prev_highest_ack is at most 4*SMSS bytes + * 2) Echoed timestamp in the last non-dup ack does not equal the + * stored timestamp */ - if ((seq_gt (tc->snd_una, tc->snd_congestion) - || !(tc->cwnd > tc->snd_mss - && tc->bytes_acked <= 4 * tc->snd_mss)) - || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + if (seq_leq (tc->snd_una, tc->snd_congestion) + && ((!(tc->cwnd > tc->snd_mss + && tc->bytes_acked <= 4 * tc->snd_mss)) + || (tc->rcv_opts.tsecr != tc->tsecr_last_ack))) { tc->rcv_dupacks = 0; return; @@ -1038,6 +1061,7 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) * three segments that have left the network and should've been * buffered at the receiver XXX */ tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; + ASSERT (tc->cwnd >= tc->snd_mss); /* If cwnd allows, send more data */ if (tcp_opts_sack_permitted (&tc->rcv_opts) @@ -1112,7 +1136,7 @@ partial_ack: >= tc->sack_sb.last_bytes_delivered); rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - tc->sack_sb.last_bytes_delivered; - if (rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ @@ -1301,6 +1325,8 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, { int written; + ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); + /* Pure ACK. Update rcv_nxt and be done. 
*/ if (PREDICT_FALSE (data_len == 0)) { @@ -1450,6 +1476,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; + vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; vlib_buffer_advance (b, n_bytes_to_drop); goto in_order; @@ -1912,11 +1939,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); /* Make sure after data segment processing ACK is sent */ new_tc0->flags |= TCP_CONN_SNDACK; + + /* Update rtt with the syn-ack sample */ + new_tc0->bytes_acked = 1; + tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); } /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ else @@ -1932,9 +1960,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - stream_session_init_fifos_pointers (&new_tc0->connection, - new_tc0->irs + 1, - new_tc0->iss + 1); + tc0->rtt_ts = 0; + tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2151,8 +2178,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; - - /* Shoulder tap the server */ stream_session_accept_notify (&tc0->connection); /* Reset SYN-ACK retransmit timer */ @@ -2175,6 +2200,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ if (tc0->snd_una == tc0->snd_una_max) { + ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); @@ -2545,10 +2571,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); - /* Init fifo pointers 
after we have iss */ - stream_session_init_fifos_pointers (&child0->connection, - child0->irs + 1, - child0->iss + 1); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -2886,9 +2908,12 @@ do { \ _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE); _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE); + _(LISTEN, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + TCP_ERROR_NONE); /* ACK for for a SYN-ACK -> tcp-rcv-process. */ _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* SYN-ACK for a SYN */ _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); @@ -2905,12 +2930,14 @@ do { \ _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN in reply to our FIN from the other side */ _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); /* FIN confirming that the peer (app) has closed */ _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); @@ -2929,6 +2956,8 @@ do { \ TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + 
TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index c825e952..103fea4c 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -63,8 +63,8 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) * window deflation" attempts to ensure that, when fast recovery * eventually ends, approximately ssthresh amount of data will be * outstanding in the network.*/ - tc->cwnd = (tc->cwnd > tc->bytes_acked) ? - tc->cwnd - tc->bytes_acked : 0; + tc->cwnd = (tc->cwnd > tc->bytes_acked + tc->snd_mss) ? + tc->cwnd - tc->bytes_acked : tc->snd_mss; if (tc->bytes_acked > tc->snd_mss) tc->cwnd += tc->snd_mss; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 41bebcb3..b418e8ba 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,17 +19,20 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, TCP_OUTPUT_N_NEXT } tcp_output_next_t; #define foreach_tcp4_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") #define foreach_tcp6_output_next \ _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") static char *tcp_error_strings[] = { #define tcp_error(n,s) s, @@ -427,16 +430,16 @@ tcp_init_mss (tcp_connection_t * tc) #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + vlib_get_main(), 
my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -445,12 +448,12 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ } while (0) always_inline void @@ -757,23 +760,22 @@ void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -851,6 +853,13 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? 
tcp4_output_node.index : tcp6_output_node.index; + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + /* Enqueue the packet */ f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); @@ -1144,6 +1153,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; } else { @@ -1232,7 +1242,7 @@ tcp_timer_persist_handler (u32 index) /* Nothing to send */ if (n_bytes <= 0) { - clib_warning ("persist found nothing to send"); + // clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } @@ -1448,7 +1458,7 @@ tcp46_output_inline (vlib_main_t * vm, tcp_connection_t *tc0; tcp_tx_trace_t *t0; tcp_header_t *th0 = 0; - u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_DROP; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; to_next[0] = bi0; @@ -1527,6 +1537,7 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } +#if 0 /* Make sure we haven't lost route to our peer */ if (PREDICT_FALSE (tc0->last_fib_check < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) @@ -1547,6 +1558,10 @@ tcp46_output_inline (vlib_main_t * vm, /* Use pre-computed dpo to set next node */ next0 = tc0->c_rmt_dpo.dpoi_next_node; vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; done: diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h index a6f62ee1..9ccfe655 100644 --- a/src/vnet/tcp/tcp_packet.h +++ b/src/vnet/tcp/tcp_packet.h @@ -168,6 +168,7 @@ typedef struct #define TCP_OPTION_LEN_TIMESTAMP 10 #define TCP_OPTION_LEN_SACK_BLOCK 8 +#define TCP_HDR_LEN_MAX 60 #define TCP_WND_MAX 65535U #define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ #define TCP_OPTS_ALIGN 4 diff 
--git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index a461e3b8..510deb4f 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -290,7 +290,7 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) { tcp_connection_t _tc, *tc = &_tc; sack_block_t *sacks; - int i, verbose = 0; + int i, verbose = 0, expected; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -326,8 +326,12 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) sacks = vec_dup (tc->snd_sacks); tcp_update_sack_list (tc, 1100, 1200); - TCP_TEST ((vec_len (tc->snd_sacks) == 5), "sack blocks %d expected %d", - vec_len (tc->snd_sacks), 5); + if (verbose) + vlib_cli_output (vm, "add new segment [1100, 1200]\n%U", + format_tcp_sacks, tc); + expected = 5 < TCP_MAX_SACK_BLOCKS ? 6 : 5; + TCP_TEST ((vec_len (tc->snd_sacks) == expected), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), expected); TCP_TEST ((tc->snd_sacks[0].start == 1100), "first sack block start %u expected %u", tc->snd_sacks[0].start, 1100); diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index e6b4f8fc..9a8ff076 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -123,7 +123,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, /* lookup session */ s0 = stream_session_lookup4 (&ip0->dst_address, &ip0->src_address, udp0->dst_port, udp0->src_port, - SESSION_TYPE_IP4_UDP, my_thread_index); + SESSION_TYPE_IP4_UDP); /* no listener */ if (PREDICT_FALSE (s0 == 0)) -- cgit 1.2.3-korg From 3eb5062b40feb3002de09a3caff86232d6e1adea Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 13 Jul 2017 01:24:57 -0400 Subject: Fixes and improved tcp/session debugging - Fix rx sack option parsing - Add session sack scoreboard tracing and replaying - Add svm fifo tracing and replaying - Scoreboard/svm fifo ooo segment reception fixes - Improved overall debugging Change-Id: Ieae07eba355e66f5935253232bb00f2dfb7ece00 Signed-off-by: Florin Coras --- src/svm/svm_fifo.c 
| 150 +++++++++++++++------ src/svm/svm_fifo.h | 41 ++++++ src/vnet/session/session.c | 74 ++++++++++- src/vnet/session/session.h | 38 ++++-- src/vnet/session/session_cli.c | 266 ++++++++++++++++++++++++++++++++----- src/vnet/tcp/builtin_client.c | 2 + src/vnet/tcp/builtin_server.c | 7 +- src/vnet/tcp/tcp.c | 208 +++++++++++++++++++++++++++++ src/vnet/tcp/tcp.h | 53 +++++++- src/vnet/tcp/tcp_input.c | 114 +++++++++++----- src/vnet/tcp/tcp_test.c | 289 +++++++++++++++++++++++++++++++++++++++++ 11 files changed, 1111 insertions(+), 131 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index da60fee5..fc2189c5 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -60,6 +60,90 @@ format_ooo_segment (u8 * s, va_list * args) return s; } +u8 * +svm_fifo_dump_trace (u8 * s, svm_fifo_t * f) +{ +#if SVM_FIFO_TRACE + svm_fifo_trace_elem_t *seg = 0; + int i = 0; + + if (f->trace) + { + vec_foreach (seg, f->trace) + { + s = format (s, "{%u, %u, %u}, ", seg->offset, seg->len, seg->action); + i++; + if (i % 5 == 0) + s = format (s, "\n"); + } + s = format (s, "\n"); + } + return s; +#else + return 0; +#endif +} + +u8 * +svm_fifo_replay (u8 * s, svm_fifo_t * f, u8 no_read, u8 verbose) +{ + int i, trace_len; + u8 *data = 0; + svm_fifo_trace_elem_t *trace; + u32 offset; + svm_fifo_t *dummy_fifo; + + if (!f) + return s; + +#if SVM_FIFO_TRACE + trace = f->trace; + trace_len = vec_len (trace); +#else + trace = 0; + trace_len = 0; +#endif + + dummy_fifo = svm_fifo_create (f->nitems); + memset (f->data, 0xFF, f->nitems); + + vec_validate (data, f->nitems); + for (i = 0; i < vec_len (data); i++) + data[i] = i; + + for (i = 0; i < trace_len; i++) + { + offset = trace[i].offset; + if (trace[i].action == 1) + { + if (verbose) + s = format (s, "adding [%u, %u]:", trace[i].offset, + (trace[i].offset + + trace[i].len) % dummy_fifo->nitems); + svm_fifo_enqueue_with_offset (dummy_fifo, trace[i].offset, + trace[i].len, &data[offset]); + 
} + else if (trace[i].action == 2) + { + if (verbose) + s = format (s, "adding [%u, %u]:", 0, trace[i].len); + svm_fifo_enqueue_nowait (dummy_fifo, trace[i].len, &data[offset]); + } + else if (!no_read) + { + if (verbose) + s = format (s, "read: %u", trace[i].len); + svm_fifo_dequeue_drop (dummy_fifo, trace[i].len); + } + if (verbose) + s = format (s, "%U", format_svm_fifo, dummy_fifo, 1); + } + + s = format (s, "result: %U", format_svm_fifo, dummy_fifo, 1); + + return s; +} + u8 * format_ooo_list (u8 * s, va_list * args) { @@ -73,6 +157,7 @@ format_ooo_list (u8 * s, va_list * args) s = format (s, " %U\n", format_ooo_segment, seg); ooo_segment_index = seg->next; } + return s; } @@ -94,10 +179,10 @@ format_svm_fifo (u8 * s, va_list * args) if (verbose) { - s = format (s, " ooo pool %d active elts\n", - pool_elts (f->ooo_segments)); + s = format (s, " ooo pool %d active elts newest %u\n", + pool_elts (f->ooo_segments), f->ooos_newest); if (svm_fifo_has_ooo_data (f)) - s = format (s, " %U", format_ooo_list, f); + s = format (s, " %U", format_ooo_list, f, verbose); } return s; } @@ -116,7 +201,6 @@ svm_fifo_create (u32 data_size_in_bytes) memset (f, 0, sizeof (*f)); f->nitems = data_size_in_bytes; f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; - return (f); } @@ -178,6 +262,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) u32 new_index, s_end_pos, s_index; u32 normalized_position, normalized_end_position; + ASSERT (offset + length <= ooo_segment_distance_from_tail (f, f->head)); normalized_position = (f->tail + offset) % f->nitems; normalized_end_position = (f->tail + offset + length) % f->nitems; @@ -205,17 +290,9 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) s = prev; s_end_pos = ooo_segment_end_pos (f, s); - /* Check head and tail now since segment may be wider at both ends so - * merge tests lower won't work */ - if (position_lt (f, normalized_position, s->start)) - { - s->start = normalized_position; - s->length = position_diff (f, 
s_end_pos, s->start); - } - if (position_gt (f, normalized_end_position, s_end_pos)) - { - s->length = position_diff (f, normalized_end_position, s->start); - } + /* Since we have previous, normalized start position cannot be smaller + * than prev->start. Check tail */ + ASSERT (position_lt (f, s->start, normalized_position)); goto check_tail; } @@ -256,6 +333,7 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) /* Pool might've moved, get segment again */ s = pool_elt_at_index (f->ooo_segments, s_index); + /* Needs to be last */ ASSERT (s->next == OOO_SEGMENT_INVALID_INDEX); new_s->prev = s_index; @@ -274,32 +352,22 @@ ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) { s->start = normalized_position; s->length = position_diff (f, s_end_pos, s->start); - } - /* Overlapping tail */ - else if (position_gt (f, normalized_end_position, s_end_pos)) - { - s->length = position_diff (f, normalized_end_position, s->start); - } - /* New segment completely covered by current one */ - else - { - /* Do Nothing */ - s = 0; - goto done; + f->ooos_newest = s - f->ooo_segments; } check_tail: - /* The new segment's tail may cover multiple smaller ones */ + + /* Overlapping tail */ if (position_gt (f, normalized_end_position, s_end_pos)) { - /* Remove the completely overlapped segments */ - it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? - pool_elt_at_index (f->ooo_segments, s->next) : 0; + s->length = position_diff (f, normalized_end_position, s->start); + + /* Remove the completely overlapped segments in the tail */ + it = ooo_segment_next (f, s); while (it && position_leq (f, ooo_segment_end_pos (f, it), normalized_end_position)) { - next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? 
- pool_elt_at_index (f->ooo_segments, it->next) : 0; + next = ooo_segment_next (f, it); ooo_segment_del (f, it - f->ooo_segments); it = next; } @@ -307,16 +375,12 @@ check_tail: /* If partial overlap with last, merge */ if (it && position_leq (f, it->start, normalized_end_position)) { - s->length = - position_diff (f, ooo_segment_end_pos (f, it), s->start); + s->length = position_diff (f, ooo_segment_end_pos (f, it), + s->start); ooo_segment_del (f, it - f->ooo_segments); } + f->ooos_newest = s - f->ooo_segments; } - -done: - /* Most recently updated segment */ - if (s) - f->ooos_newest = s - f->ooo_segments; } /** @@ -422,6 +486,8 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 max_bytes, u8 * copy_from_here) total_copy_bytes = max_bytes; } + svm_fifo_trace_add (f, f->head, total_copy_bytes, 2); + /* Any out-of-order segments to collect? */ if (PREDICT_FALSE (f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX)) total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); @@ -499,6 +565,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, if ((required_bytes + offset_from_tail) > (nitems - cursize)) return -1; + svm_fifo_trace_add (f, offset, required_bytes, 1); + ooo_segment_add (f, offset, required_bytes); /* Number of bytes we're going to copy */ @@ -707,6 +775,8 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, u32 max_bytes) /* Number of bytes we're going to drop */ total_drop_bytes = (cursize < max_bytes) ? cursize : max_bytes; + svm_fifo_trace_add (f, f->tail, total_drop_bytes, 3); + /* Number of bytes in first copy segment */ first_drop_bytes = ((nitems - f->head) < total_drop_bytes) ? 
diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index fe21de47..a83cd858 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -36,8 +36,16 @@ typedef struct format_function_t format_ooo_segment; format_function_t format_ooo_list; +#define SVM_FIFO_TRACE (0) #define OOO_SEGMENT_INVALID_INDEX ((u32)~0) +typedef struct +{ + u32 offset; + u32 len; + u32 action; +} svm_fifo_trace_elem_t; + typedef struct _svm_fifo { volatile u32 cursize; /**< current fifo size */ @@ -64,9 +72,28 @@ typedef struct _svm_fifo u32 ooos_newest; /**< Last segment to have been updated */ struct _svm_fifo *next; /**< next in freelist/active chain */ struct _svm_fifo *prev; /**< prev in active chain */ +#if SVM_FIFO_TRACE + svm_fifo_trace_elem_t *trace; +#endif CLIB_CACHE_LINE_ALIGN_MARK (data); } svm_fifo_t; +#if SVM_FIFO_TRACE +#define svm_fifo_trace_add(_f, _s, _l, _t) \ +{ \ + svm_fifo_trace_elem_t *trace_elt; \ + vec_add2(_f->trace, trace_elt, 1); \ + trace_elt->offset = _s; \ + trace_elt->len = _l; \ + trace_elt->action = _t; \ +} +#else +#define svm_fifo_trace_add(_f, _s, _l, _t) +#endif + +u8 *svm_fifo_dump_trace (u8 * s, svm_fifo_t * f); +u8 *svm_fifo_replay (u8 * s, svm_fifo_t * f, u8 no_read, u8 verbose); + static inline u32 svm_fifo_max_dequeue (svm_fifo_t * f) { @@ -132,6 +159,12 @@ svm_fifo_newest_ooo_segment (svm_fifo_t * f) return pool_elt_at_index (f->ooo_segments, f->ooos_newest); } +always_inline void +svm_fifo_newest_ooo_segment_reset (svm_fifo_t * f) +{ + f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; +} + always_inline u32 ooo_segment_distance_from_tail (svm_fifo_t * f, u32 pos) { @@ -174,6 +207,14 @@ ooo_segment_get_prev (svm_fifo_t * f, ooo_segment_t * s) return pool_elt_at_index (f->ooo_segments, s->prev); } +always_inline ooo_segment_t * +ooo_segment_next (svm_fifo_t * f, ooo_segment_t * s) +{ + if (s->next == OOO_SEGMENT_INVALID_INDEX) + return 0; + return pool_elt_at_index (f->ooo_segments, s->next); +} + #endif /* __included_ssvm_fifo_h__ */ /* diff --git 
a/src/vnet/session/session.c b/src/vnet/session/session.c index 0a86d563..2c2a27c1 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -325,9 +325,9 @@ stream_session_half_open_lookup (session_manager_main_t * smm, } transport_connection_t * -stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) +stream_session_lookup_transport_wt4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) { session_manager_main_t *smm = &session_manager_main; session_kv4_t kv4; @@ -358,9 +358,40 @@ stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, } transport_connection_t * -stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) +stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + { + s = stream_session_get_from_handle (kv4.value); + return tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener4 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); + return 0; +} + +transport_connection_t * +stream_session_lookup_transport_wt6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 
lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) { session_manager_main_t *smm = &session_manager_main; stream_session_t *s; @@ -390,6 +421,37 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, return 0; } +transport_connection_t * +stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *s; + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + { + s = stream_session_get_from_handle (kv6.value); + return tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener6 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); + + return 0; +} + int stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, stream_session_t ** ret_s) diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index b4507d4e..6069c574 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -263,15 +263,30 @@ stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, u16 lcl_port, u16 rmt_port, u8 proto); transport_connection_t - * stream_session_lookup_transport4 (ip4_address_t * lcl, - ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); -transport_connection_t - * stream_session_lookup_transport6 (ip6_address_t * lcl, - ip6_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); + * 
stream_session_lookup_transport_wt4 (ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +transport_connection_t *stream_session_lookup_transport4 (ip4_address_t * lcl, + ip4_address_t * rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto); +transport_connection_t *stream_session_lookup_transport_wt6 (ip6_address_t * + lcl, + ip6_address_t * + rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto, + u32 + thread_index); +transport_connection_t *stream_session_lookup_transport6 (ip6_address_t * lcl, + ip6_address_t * rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto); + stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto); void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); @@ -415,7 +430,12 @@ void stream_session_cleanup (stream_session_t * s); void session_send_session_evt_to_thread (u64 session_handle, fifo_event_type_t evt_type, u32 thread_index); + u8 *format_stream_session (u8 * s, va_list * args); +uword unformat_stream_session (unformat_input_t * input, va_list * args); +uword unformat_transport_connection (unformat_input_t * input, + va_list * args); + int send_session_connected_callback (u32 app_index, u32 api_context, stream_session_t * s, u8 is_fail); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index e06bc586..e8e6f99c 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -81,12 +81,141 @@ format_stream_session (u8 * s, va_list * args) { clib_warning ("Session in state: %d!", ss->session_state); } - vec_free (str); return s; } +uword +unformat_stream_session_id (unformat_input_t * input, va_list * args) +{ + u8 *proto = va_arg (*args, u8 *); + ip46_address_t *lcl = va_arg (*args, ip46_address_t *); + ip46_address_t *rmt = va_arg (*args, ip46_address_t *); + u16 *lcl_port = va_arg (*args, u16 *); + u16 *rmt_port = va_arg (*args, u16 *); + u8 *is_ip4 = va_arg (*args, u8 *); + u8 
tuple_is_set = 0; + + memset (lcl, 0, sizeof (*lcl)); + memset (rmt, 0, sizeof (*rmt)); + + if (unformat (input, "tcp")) + { + *proto = TRANSPORT_PROTO_TCP; + } + if (unformat (input, "udp")) + { + *proto = TRANSPORT_PROTO_UDP; + } + else if (unformat (input, "%U:%d->%U:%d", unformat_ip4_address, &lcl->ip4, + lcl_port, unformat_ip4_address, &rmt->ip4, rmt_port)) + { + *is_ip4 = 1; + tuple_is_set = 1; + } + else if (unformat (input, "%U:%d->%U:%d", unformat_ip6_address, &lcl->ip6, + lcl_port, unformat_ip6_address, &rmt->ip6, rmt_port)) + { + *is_ip4 = 0; + tuple_is_set = 1; + } + else + return 0; + + if (tuple_is_set) + return 1; + + return 0; +} + +uword +unformat_stream_session (unformat_input_t * input, va_list * args) +{ + stream_session_t **result = va_arg (*args, stream_session_t **); + stream_session_t *s; + u8 proto = ~0; + ip46_address_t lcl, rmt; + u32 lcl_port = 0, rmt_port = 0; + u8 is_ip4 = 0, s_type = ~0, id_is_set = 0; + + if (unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, + &lcl_port, &rmt_port, &is_ip4)) + { + id_is_set = 1; + } + else + return 0; + + if (!id_is_set) + { + return 0; + } + + s_type = session_type_from_proto_and_ip (proto, is_ip4); + if (is_ip4) + s = stream_session_lookup4 (&lcl.ip4, &rmt.ip4, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), s_type); + else + s = stream_session_lookup6 (&lcl.ip6, &rmt.ip6, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), s_type); + if (s) + { + *result = s; + return 1; + } + return 0; +} + +uword +unformat_transport_connection (unformat_input_t * input, va_list * args) +{ + transport_connection_t **result = va_arg (*args, transport_connection_t **); + u32 suggested_proto = va_arg (*args, u32); + transport_connection_t *tc; + u8 proto = ~0; + ip46_address_t lcl, rmt; + u32 lcl_port = 0, rmt_port = 0; + u8 is_ip4 = 0, s_type = ~0, id_is_set = 0; + + if (unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, + &lcl_port, 
&rmt_port, &is_ip4)) + { + id_is_set = 1; + } + else + return 0; + + if (!id_is_set) + { + return 0; + } + + proto = (proto == (u8) ~ 0) ? suggested_proto : proto; + if (proto == (u8) ~ 0) + return 0; + s_type = session_type_from_proto_and_ip (proto, is_ip4); + if (is_ip4) + tc = stream_session_lookup_transport4 (&lcl.ip4, &rmt.ip4, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), + s_type); + else + tc = stream_session_lookup_transport6 (&lcl.ip6, &rmt.ip6, + clib_host_to_net_u16 (lcl_port), + clib_host_to_net_u16 (rmt_port), + s_type); + + if (tc) + { + *result = tc; + return 1; + } + return 0; +} + static clib_error_t * show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) @@ -95,13 +224,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, int verbose = 0, i; stream_session_t *pool; stream_session_t *s; - u8 *str = 0, one_session = 0, proto_set = 0, proto = 0; - u8 is_ip4 = 0, s_type = 0; - ip4_address_t lcl_ip4, rmt_ip4; - u32 lcl_port = 0, rmt_port = 0; - - memset (&lcl_ip4, 0, sizeof (lcl_ip4)); - memset (&rmt_ip4, 0, sizeof (rmt_ip4)); + u8 *str = 0, one_session = 0; if (!smm->is_enabled) { @@ -114,40 +237,18 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (input, "verbose")) verbose = 1; - else if (unformat (input, "tcp")) - { - proto_set = 1; - proto = TRANSPORT_PROTO_TCP; - } - else if (unformat (input, "%U:%d->%U:%d", - unformat_ip4_address, &lcl_ip4, &lcl_port, - unformat_ip4_address, &rmt_ip4, &rmt_port)) + else if (unformat (input, "%U", unformat_stream_session, &s)) { one_session = 1; - is_ip4 = 1; } - else - break; + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); } if (one_session) { - if (!proto_set) - { - vlib_cli_output (vm, "proto not set"); - return clib_error_return (0, "proto not set"); - } - - s_type = session_type_from_proto_and_ip (proto, is_ip4); - s = stream_session_lookup4 
(&lcl_ip4, &rmt_ip4, - clib_host_to_net_u16 (lcl_port), - clib_host_to_net_u16 (rmt_port), s_type); - if (s) - vlib_cli_output (vm, "%U", format_stream_session, s, 2); - else - vlib_cli_output (vm, "session does not exist"); - + vlib_cli_output (vm, "%U", format_stream_session, s, 2); return 0; } @@ -274,6 +375,103 @@ VLIB_CLI_COMMAND (clear_session_command, static) = }; /* *INDENT-ON* */ +static clib_error_t * +show_session_fifo_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + stream_session_t *s = 0; + u8 is_rx = 0, *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_stream_session, &s)) + ; + else if (unformat (input, "rx")) + is_rx = 1; + else if (unformat (input, "tx")) + is_rx = 0; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!SVM_FIFO_TRACE) + { + vlib_cli_output (vm, "fifo tracing not enabled"); + return 0; + } + + if (!s) + { + vlib_cli_output (vm, "could not find session"); + return 0; + } + + str = is_rx ? 
+ svm_fifo_dump_trace (str, s->server_rx_fifo) : + svm_fifo_dump_trace (str, s->server_tx_fifo); + + vlib_cli_output (vm, "%v", str); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_session_fifo_trace_command, static) = +{ + .path = "show session fifo trace", + .short_help = "show session fifo trace ", + .function = show_session_fifo_trace_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +session_replay_fifo_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + stream_session_t *s = 0; + u8 is_rx = 0, *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_stream_session, &s)) + ; + else if (unformat (input, "rx")) + is_rx = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!SVM_FIFO_TRACE) + { + vlib_cli_output (vm, "fifo tracing not enabled"); + return 0; + } + + if (!s) + { + vlib_cli_output (vm, "could not find session"); + return 0; + } + + str = is_rx ? 
+ svm_fifo_replay (str, s->server_rx_fifo, 0, 1) : + svm_fifo_replay (str, s->server_tx_fifo, 0, 1); + + vlib_cli_output (vm, "%v", str); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (session_replay_fifo_trace_command, static) = +{ + .path = "session replay fifo", + .short_help = "session replay fifo ", + .function = session_replay_fifo_command_fn, +}; +/* *INDENT-ON* */ + static clib_error_t * session_enable_disable_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index a6c8a235..a92bacaa 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -464,6 +464,8 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, static void builtin_session_reset_callback (stream_session_t * s) { + if (s->session_state == SESSION_STATE_READY) + clib_warning ("Reset active connection %U", format_stream_session, s, 2); return; } diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 8e958ac0..4ecaf56a 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -99,8 +99,7 @@ builtin_session_disconnect_callback (stream_session_t * s) void builtin_session_reset_callback (stream_session_t * s) { - clib_warning ("called.. 
"); - + clib_warning ("Reset session %U", format_stream_session, s, 2); stream_session_cleanup (s); } @@ -224,10 +223,6 @@ builtin_server_rx_callback (stream_session_t * s) clib_warning ("session stuck: %U", format_stream_session, s, 2); } } - else - { - bsm->rx_retries[thread_index][s->session_index] = 0; - } return 0; } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index f379e699..8ed325d2 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -732,6 +732,7 @@ format_tcp_connection (u8 * s, va_list * args) if (verbose > 1) s = format (s, " %U\n%U", format_tcp_timers, tc, format_tcp_vars, tc); } + return s; } @@ -791,6 +792,30 @@ format_tcp_sacks (u8 * s, va_list * args) return s; } +u8 * +format_tcp_rcv_sacks (u8 * s, va_list * args) +{ + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + sack_block_t *sacks = tc->rcv_opts.sacks; + sack_block_t *block; + int i, len = 0; + + len = vec_len (sacks); + for (i = 0; i < len - 1; i++) + { + block = &sacks[i]; + s = format (s, " start %u end %u\n", block->start - tc->iss, + block->end - tc->iss); + } + if (len) + { + block = &sacks[len - 1]; + s = format (s, " start %u end %u", block->start - tc->iss, + block->end - tc->iss); + } + return s; +} + u8 * format_tcp_sack_hole (u8 * s, va_list * args) { @@ -820,6 +845,7 @@ format_tcp_scoreboard (u8 * s, va_list * args) s = format (s, "%U", format_tcp_sack_hole, hole); hole = scoreboard_next_hole (sb, hole); } + return s; } @@ -1304,7 +1330,189 @@ VLIB_CLI_COMMAND (tcp_src_address_command, static) = }; /* *INDENT-ON* */ +static u8 * +tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb) +{ +#if TCP_SCOREBOARD_TRACE + + scoreboard_trace_elt_t *block; + int i = 0; + + if (!sb->trace) + return s; + + s = format (s, "scoreboard trace:"); + vec_foreach (block, sb->trace) + { + s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end, + block->ack, block->snd_una_max, block->group); + if ((++i % 3) == 0) + s = format (s, "\n"); + } + return s; 
+#else + return 0; +#endif +} + +static clib_error_t * +tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc; + u8 *s = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + s = tcp_scoreboard_dump_trace (s, &tc->sack_sb); + vlib_cli_output (vm, "%v", s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) = +{ + .path = "show tcp scoreboard trace", + .short_help = "show tcp scoreboard trace ", + .function = tcp_show_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ +u8 * +tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose) +{ + int i, trace_len; + scoreboard_trace_elt_t *trace; + u32 next_ack, left, group, has_new_ack = 0; + tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc; + sack_block_t *block; + + if (!tc) + return s; + + memset (dummy_tc, 0, sizeof (*dummy_tc)); + tcp_connection_timers_init (dummy_tc); + scoreboard_init (&dummy_tc->sack_sb); + dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; + +#if TCP_SCOREBOARD_TRACE + trace = tc->sack_sb.trace; + trace_len = vec_len (tc->sack_sb.trace); +#else + trace = 0; + trace_len = 0; +#endif + + for (i = 0; i < trace_len; i++) + { + if (trace[i].ack != 0) + { + dummy_tc->snd_una = trace[i].ack - 1448; + dummy_tc->snd_una_max = trace[i].ack; + } + } + + left = 0; + while (left < trace_len) + { + group = trace[left].group; + vec_reset_length (dummy_tc->rcv_opts.sacks); + has_new_ack = 0; + while (trace[left].group == group) + { + if (trace[left].ack != 0) + { + if (verbose) + s = 
format (s, "Adding ack %u, snd_una_max %u, segs: ", + trace[left].ack, trace[left].snd_una_max); + dummy_tc->snd_una_max = trace[left].snd_una_max; + next_ack = trace[left].ack; + has_new_ack = 1; + } + else + { + if (verbose) + s = format (s, "[%u, %u], ", trace[left].start, + trace[left].end); + vec_add2 (dummy_tc->rcv_opts.sacks, block, 1); + block->start = trace[left].start; + block->end = trace[left].end; + } + left++; + } + + /* Push segments */ + tcp_rcv_sacks (dummy_tc, next_ack); + if (has_new_ack) + dummy_tc->snd_una = next_ack + dummy_tc->sack_sb.snd_una_adv; + + if (verbose) + s = format (s, "result: %U", format_tcp_scoreboard, + &dummy_tc->sack_sb); + + } + s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb); + + return s; +} + +static clib_error_t * +tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + transport_connection_t *tconn = 0; + tcp_connection_t *tc = 0; + u8 *str = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U", unformat_transport_connection, &tconn, + TRANSPORT_PROTO_TCP)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!TCP_SCOREBOARD_TRACE) + { + vlib_cli_output (vm, "scoreboard tracing not enabled"); + return 0; + } + + tc = tcp_get_connection_from_transport (tconn); + if (!tc) + { + vlib_cli_output (vm, "connection not found"); + return 0; + } + str = tcp_scoreboard_replay (str, tc, 1); + vlib_cli_output (vm, "%v", str); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) = +{ + .path = "tcp replay scoreboard", + .short_help = "tcp replay scoreboard ", + .function = tcp_scoreboard_trace_fn, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 37b10fd4..fd0d02b9 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -62,6 +62,7 @@ 
typedef enum _tcp_state format_function_t format_tcp_state; format_function_t format_tcp_flags; format_function_t format_tcp_sacks; +format_function_t format_tcp_rcv_sacks; /** TCP timers */ #define foreach_tcp_timer \ @@ -151,9 +152,19 @@ enum #undef _ }; +#define TCP_SCOREBOARD_TRACE (0) #define TCP_MAX_SACK_BLOCKS 15 /**< Max number of SACK blocks stored */ #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) +typedef struct _scoreboard_trace_elt +{ + u32 start; + u32 end; + u32 ack; + u32 snd_una_max; + u32 group; +} scoreboard_trace_elt_t; + typedef struct _sack_scoreboard_hole { u32 next; /**< Index for next entry in linked list */ @@ -177,8 +188,38 @@ typedef struct _sack_scoreboard u32 rescue_rxt; /**< Rescue sequence number */ u32 lost_bytes; /**< Bytes lost as per RFC6675 */ u32 cur_rxt_hole; /**< Retransmitting from this hole */ + +#if TCP_SCOREBOARD_TRACE + scoreboard_trace_elt_t *trace; +#endif + } sack_scoreboard_t; +#if TCP_SCOREBOARD_TRACE +#define tcp_scoreboard_trace_add(_tc, _ack) \ +{ \ + static u64 _group = 0; \ + sack_scoreboard_t *_sb = &_tc->sack_sb; \ + sack_block_t *_sack, *_sacks; \ + scoreboard_trace_elt_t *_elt; \ + int i; \ + _group++; \ + _sacks = _tc->rcv_opts.sacks; \ + for (i = 0; i < vec_len (_sacks); i++) \ + { \ + _sack = &_sacks[i]; \ + vec_add2 (_sb->trace, _elt, 1); \ + _elt->start = _sack->start; \ + _elt->end = _sack->end; \ + _elt->ack = _elt->end == _ack ? _ack : 0; \ + _elt->snd_una_max = _elt->end == _ack ? 
_tc->snd_una_max : 0; \ + _elt->group = _group; \ + } \ +} +#else +#define tcp_scoreboard_trace_add(_tc, _ack) +#endif + typedef enum _tcp_cc_algorithm_type { TCP_CC_NEWRENO, @@ -405,6 +446,12 @@ tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); } +always_inline tcp_connection_t * +tcp_get_connection_from_transport (transport_connection_t * tconn) +{ + return (tcp_connection_t *) tconn; +} + void tcp_connection_close (tcp_connection_t * tc); void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); @@ -414,6 +461,8 @@ u8 *format_tcp_connection_id (u8 * s, va_list * args); u8 *format_tcp_connection (u8 * s, va_list * args); u8 *format_tcp_scoreboard (u8 * s, va_list * args); +u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose); + always_inline tcp_connection_t * tcp_listener_get (u32 tli) { @@ -689,7 +738,7 @@ sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, start, u8 have_sent_1_smss, u8 * can_rescue, u8 * snd_limited); -void scoreboard_init_high_rxt (sack_scoreboard_t * sb); +void scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq); always_inline sack_scoreboard_hole_t * scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) @@ -740,6 +789,7 @@ scoreboard_clear (sack_scoreboard_t * sb) scoreboard_remove_hole (sb, hole); } ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX); + ASSERT (pool_elts (sb->holes) == 0); sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; sb->last_bytes_delivered = 0; @@ -759,6 +809,7 @@ scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) always_inline u32 scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) { + ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes)); return hole - sb->holes; } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 45db0da6..bc7d9015 100644 --- 
a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -206,8 +206,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) vec_reset_length (to->sacks); for (j = 0; j < to->n_sack_blocks; j++) { - b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j)); - b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j)); + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j)); vec_add1 (to->sacks, b); } break; @@ -540,6 +540,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + /* Poison the entry */ + if (CLIB_DEBUG > 0) + memset (hole, 0xfe, sizeof (*hole)); + pool_put (sb->holes, hole); } @@ -555,7 +559,7 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, hole->start = start; hole->end = end; - hole_index = hole - sb->holes; + hole_index = scoreboard_hole_index (sb, hole); prev = scoreboard_get_hole (sb, prev_index); if (prev) @@ -680,12 +684,30 @@ scoreboard_next_rxt_hole (sack_scoreboard_t * sb, } void -scoreboard_init_high_rxt (sack_scoreboard_t * sb) +scoreboard_init_high_rxt (sack_scoreboard_t * sb, u32 seq) { sack_scoreboard_hole_t *hole; hole = scoreboard_first_hole (sb); - sb->high_rxt = hole->start; - sb->cur_rxt_hole = sb->head; + if (hole) + { + seq = seq_gt (seq, hole->start) ? seq : hole->start; + sb->cur_rxt_hole = sb->head; + } + sb->high_rxt = seq; +} + +/** + * Test that scoreboard is sane after recovery + * + * Returns 1 if scoreboard is empty or if first hole beyond + * snd_una. 
+ */ +u8 +tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (&tc->sack_sb); + return (!hole || seq_geq (hole->start, tc->snd_una)); } void @@ -712,7 +734,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { if (seq_lt (blk->start, blk->end) && seq_gt (blk->start, tc->snd_una) - && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt)) + && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_una_max)) { blk++; continue; @@ -731,6 +753,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) if (vec_len (tc->rcv_opts.sacks) == 0) return; + tcp_scoreboard_trace_add (tc, ack); + /* Make sure blocks are ordered */ for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++) for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++) @@ -797,7 +821,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->last_bytes_delivered += next_hole->start - hole->end; } - else if (!next_hole) + else { ASSERT (seq_geq (sb->high_sacked, ack)); sb->snd_una_adv = sb->high_sacked - ack; @@ -824,12 +848,14 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) if (seq_lt (blk->end, hole->end)) { hole_index = scoreboard_hole_index (sb, hole); - scoreboard_insert_hole (sb, hole_index, blk->end, hole->end); + next_hole = scoreboard_insert_hole (sb, hole_index, blk->end, + hole->end); /* Pool might've moved */ hole = scoreboard_get_hole (sb, hole_index); hole->end = blk->start; blk_index++; + ASSERT (hole->next == scoreboard_hole_index (sb, next_hole)); } else if (seq_lt (blk->start, hole->end)) { @@ -957,7 +983,7 @@ tcp_cc_recover (tcp_connection_t * tc) ASSERT (tc->rto_boff == 0); ASSERT (!tcp_in_cong_recovery (tc)); - + ASSERT (tcp_scoreboard_is_sane_post_recovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); return 0; } @@ -965,7 +991,7 @@ tcp_cc_recover (tcp_connection_t * tc) static void tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) { - ASSERT (!tcp_in_cong_recovery (tc)); + ASSERT (!tcp_in_cong_recovery (tc) || 
tcp_is_lost_fin (tc)); /* Congestion avoidance */ tc->cc_algo->rcv_ack (tc); @@ -1064,10 +1090,10 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) ASSERT (tc->cwnd >= tc->snd_mss); /* If cwnd allows, send more data */ - if (tcp_opts_sack_permitted (&tc->rcv_opts) - && scoreboard_first_hole (&tc->sack_sb)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { - scoreboard_init_high_rxt (&tc->sack_sb); + scoreboard_init_high_rxt (&tc->sack_sb, + tc->snd_una + tc->snd_mss); tcp_fast_retransmit_sack (tc); } else @@ -1134,12 +1160,13 @@ partial_ack: /* Remove retransmitted bytes that have been delivered */ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv >= tc->sack_sb.last_bytes_delivered); - rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv - - tc->sack_sb.last_bytes_delivered; - if (0 && rxt_delivered && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + + if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) { /* If we have sacks and we haven't gotten an ack beyond high_rxt, * remove sacked bytes delivered */ + rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv + - tc->sack_sb.last_bytes_delivered; ASSERT (tc->snd_rxt_bytes >= rxt_delivered); tc->snd_rxt_bytes -= rxt_delivered; } @@ -1256,6 +1283,18 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, return 0; } +static u8 +tcp_sack_vector_is_sane (sack_block_t * sacks) +{ + int i; + for (i = 1; i < vec_len (sacks); i++) + { + if (sacks[i - 1].end == sacks[i].start) + return 0; + } + return 1; +} + /** * Build SACK list as per RFC2018. * @@ -1316,6 +1355,9 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) /* Replace old vector with new one */ vec_free (tc->snd_sacks); tc->snd_sacks = new_list; + + /* Segments should not 'touch' */ + ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks)); } /** Enqueue data for delivery to application */ @@ -1330,7 +1372,6 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Pure ACK. Update rcv_nxt and be done. 
*/ if (PREDICT_FALSE (data_len == 0)) { - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; return TCP_ERROR_PURE_ACK; } @@ -1385,7 +1426,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; - int rv; + int rv, offset; ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); @@ -1421,12 +1462,12 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); if (newest) { - start = - tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + offset = ooo_segment_offset (s0->server_rx_fifo, newest); + ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt); + start = tc->rcv_nxt + offset; end = start + ooo_segment_length (s0->server_rx_fifo, newest); tcp_update_sack_list (tc, start, end); - - ASSERT (seq_gt (start, tc->rcv_nxt)); + svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo); } } @@ -2736,12 +2777,12 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* lookup session */ tc0 = (tcp_connection_t *) - stream_session_lookup_transport4 (&ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); + stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); } else { @@ -2754,12 +2795,12 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = (tcp_connection_t *) - stream_session_lookup_transport6 (&ip60->src_address, - &ip60->dst_address, - tcp0->src_port, - tcp0->dst_port, - SESSION_TYPE_IP6_TCP, - my_thread_index); + stream_session_lookup_transport_wt6 (&ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); } /* Length check */ @@ -2931,6 +2972,8 @@ do { \ _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); _(ESTABLISHED, 
TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); /* ACK or FIN-ACK to our FIN */ _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, @@ -2954,6 +2997,7 @@ do { \ _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 510deb4f..f37ba96d 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -34,6 +34,38 @@ } \ } +/* *INDENT-OFF* */ +scoreboard_trace_elt_t sb_trace[] = {}; +/* *INDENT-ON* */ + +static int +tcp_test_scoreboard_replay (vlib_main_t * vm, unformat_input_t * input) +{ + int verbose = 0; + tcp_connection_t _tc, *tc = &_tc; + u8 *s = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "detail")) + verbose = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + +#if TCP_SCOREBOARD_TRACE + tc->sack_sb.trace = sb_trace; +#endif + s = tcp_scoreboard_replay (s, tc, verbose); + vlib_cli_output (vm, "%v", s); + return 0; +} + static int tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) { @@ -47,6 +79,8 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) { if (unformat (input, "verbose")) verbose = 1; + else if (unformat (input, "replay")) + return tcp_test_scoreboard_replay (vm, input); } memset (tc, 0, sizeof (*tc)); @@ -282,6 +316,44 
@@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->last_bytes_delivered == 400), "last bytes delivered %d", sb->last_bytes_delivered); + /* + * One hole close to head, patch head, split in two and start acking + * the lowest part + */ + scoreboard_clear (sb); + tc->snd_una = 0; + tc->snd_una_max = 1000; + tc->snd_nxt = 1000; + + block.start = 500; + block.end = 1000; + vec_add1 (tc->rcv_opts.sacks, block); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + + tcp_rcv_sacks (tc, 0); + if (verbose) + vlib_cli_output (vm, "sb added [500, 1000]:\n%U", + format_tcp_scoreboard, sb); + + vec_reset_length (tc->rcv_opts.sacks); + block.start = 300; + block.end = 400; + vec_add1 (tc->rcv_opts.sacks, block); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); + tcp_rcv_sacks (tc, 100); + if (verbose) + vlib_cli_output (vm, "sb added [0, 100] [300, 400]:\n%U", + format_tcp_scoreboard, sb); + TCP_TEST ((pool_elts (sb->holes) == 2), + "scoreboard has %d elements", pool_elts (sb->holes)); + + tc->snd_una = 100; + tcp_rcv_sacks (tc, 200); + tcp_rcv_sacks (tc, 300); + if (verbose) + vlib_cli_output (vm, "sb added [0, 300]:\n%U", format_tcp_scoreboard, sb); + TCP_TEST ((sb->sacked_bytes == 500), "sacked bytes %d", sb->sacked_bytes); + return 0; } @@ -390,6 +462,37 @@ tcp_test_sack_tx (vlib_main_t * vm, unformat_input_t * input) vlib_cli_output (vm, "advance rcv_nxt to 1200\n%U", format_tcp_sacks, tc); TCP_TEST ((vec_len (tc->snd_sacks) == 0), "sack blocks %d expected %d", vec_len (tc->snd_sacks), 0); + + + /* + * Add 2 blocks, overwrite first and update rcv_nxt to also remove it + */ + + vec_reset_length (tc->snd_sacks); + tc->rcv_nxt = 0; + + tcp_update_sack_list (tc, 100, 200); + tcp_update_sack_list (tc, 300, 400); + + if (verbose) + vlib_cli_output (vm, "add [100, 200] [300, 400]\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 2), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), 2); + TCP_TEST 
((tc->snd_sacks[0].start == 300), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 300); + + tc->rcv_nxt = 100; + tcp_update_sack_list (tc, 100, 100); + if (verbose) + vlib_cli_output (vm, "add [100, 200] rcv_nxt = 100\n%U", + format_tcp_sacks, tc); + TCP_TEST ((vec_len (tc->snd_sacks) == 1), + "sack blocks %d expected %d", vec_len (tc->snd_sacks), 1); + TCP_TEST ((tc->snd_sacks[0].start == 300), + "first sack block start %u expected %u", tc->snd_sacks[0].start, + 300); return 0; } @@ -1188,6 +1291,176 @@ tcp_test_fifo4 (vlib_main_t * vm, unformat_input_t * input) return 0; } +static u32 +fifo_pos (svm_fifo_t * f, u32 pos) +{ + return pos % f->nitems; +} + +static int +tcp_test_fifo5 (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t *f; + u32 fifo_size = 400, j = 0, offset = 200; + int i, rv, verbose = 0; + u8 *test_data = 0, *data_buf = 0; + ooo_segment_t *ooo_seg; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + + f = fifo_prepare (fifo_size); + svm_fifo_init_pointers (f, offset); + + vec_validate (test_data, 399); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i % 0xff; + + /* + * Start with [100, 200] and [300, 400] + */ + svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]); + svm_fifo_enqueue_with_offset (f, 300, 100, &test_data[300]); + + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 2), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + TCP_TEST ((f->ooos_newest == 1), "newest %u", f->ooos_newest); + if (verbose) + vlib_cli_output (vm, "fifo after [100, 200] and [300, 400] : %U", + format_svm_fifo, f, 2 /* verbose */ ); + + /* + * Add [225, 275] + */ + + rv = svm_fifo_enqueue_with_offset (f, 225, 50, &test_data[200]); + if (verbose) + vlib_cli_output (vm, "fifo 
after [225, 275] : %U", + format_svm_fifo, f, 2 /* verbose */ ); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 3), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 100 + offset)), + "first seg start %u expected %u", ooo_seg->start, + fifo_pos (f, 100 + offset)); + TCP_TEST ((ooo_seg->length == 100), "first seg length %u expected %u", + ooo_seg->length, 100); + ooo_seg = ooo_segment_next (f, ooo_seg); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 225 + offset)), + "second seg start %u expected %u", + ooo_seg->start, fifo_pos (f, 225 + offset)); + TCP_TEST ((ooo_seg->length == 50), "second seg length %u expected %u", + ooo_seg->length, 50); + ooo_seg = ooo_segment_next (f, ooo_seg); + TCP_TEST ((ooo_seg->start == fifo_pos (f, 300 + offset)), + "third seg start %u expected %u", + ooo_seg->start, fifo_pos (f, 300 + offset)); + TCP_TEST ((ooo_seg->length == 100), "third seg length %u expected %u", + ooo_seg->length, 100); + TCP_TEST ((f->ooos_newest == 2), "newest %u", f->ooos_newest); + /* + * Add [190, 310] + */ + rv = svm_fifo_enqueue_with_offset (f, 190, 120, &test_data[190]); + if (verbose) + vlib_cli_output (vm, "fifo after [190, 310] : %U", + format_svm_fifo, f, 1 /* verbose */ ); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == fifo_pos (f, offset + 100)), + "first seg start %u expected %u", + ooo_seg->start, fifo_pos (f, offset + 100)); + TCP_TEST ((ooo_seg->length == 300), "first seg length %u expected %u", + ooo_seg->length, 300); + + /* + * Add [0, 150] + */ + rv = svm_fifo_enqueue_nowait (f, 150, test_data); + + if (verbose) + vlib_cli_output (vm, "fifo after [0 150] : %U", format_svm_fifo, f, + 2 /* verbose */ ); + + TCP_TEST ((rv == 400), "managed to enqueue %u expected %u", rv, 400); + TCP_TEST 
((svm_fifo_number_ooo_segments (f) == 0), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + + vec_validate (data_buf, 399); + svm_fifo_peek (f, 0, 400, data_buf); + if (compare_data (data_buf, test_data, 0, 400, &j)) + { + TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], + test_data[j]); + } + + /* + * Add [100 200] and overlap it with [50 250] + */ + svm_fifo_free (f); + f = fifo_prepare (fifo_size); + + svm_fifo_enqueue_with_offset (f, 100, 100, &test_data[100]); + svm_fifo_enqueue_with_offset (f, 50, 200, &test_data[50]); + TCP_TEST ((svm_fifo_number_ooo_segments (f) == 1), + "number of ooo segments %u", svm_fifo_number_ooo_segments (f)); + ooo_seg = svm_fifo_first_ooo_segment (f); + TCP_TEST ((ooo_seg->start == 50), "first seg start %u expected %u", + ooo_seg->start, 50); + TCP_TEST ((ooo_seg->length == 200), "first seg length %u expected %u", + ooo_seg->length, 200); + + svm_fifo_free (f); + vec_free (test_data); + return 0; +} + +/* *INDENT-OFF* */ +svm_fifo_trace_elem_t fifo_trace[] = {}; +/* *INDENT-ON* */ + +static int +tcp_test_fifo_replay (vlib_main_t * vm, unformat_input_t * input) +{ + svm_fifo_t f; + int verbose = 0; + u8 no_read = 0, *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "no-read")) + no_read = 1; + else + { + clib_error_t *e = clib_error_return + (0, "unknown input `%U'", format_unformat_error, input); + clib_error_report (e); + return -1; + } + } + +#if SVMF_FIFO_TRACE + f.trace = fifo_trace; +#endif + + str = svm_fifo_replay (str, &f, no_read, verbose); + vlib_cli_output (vm, "%v", str); + return 0; +} + static int tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) { @@ -1237,6 +1510,14 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) if (tcp_test_fifo3 (vm, input)) return -1; unformat_free (input); + + res = tcp_test_fifo4 (vm, input); + if (res) + return res; + + res = 
tcp_test_fifo5 (vm, input); + if (res) + return res; } else { @@ -1256,6 +1537,14 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) { res = tcp_test_fifo4 (vm, input); } + else if (unformat (input, "fifo5")) + { + res = tcp_test_fifo5 (vm, input); + } + else if (unformat (input, "replay")) + { + res = tcp_test_fifo_replay (vm, input); + } } return res; -- cgit 1.2.3-korg From 6534b7aa13bc5bed15ed87f47bb766405963e9e8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 18 Jul 2017 05:38:03 -0400 Subject: Improvements to tcp rx path and debugging - Increment rcv_nxt for fin packets - Call tcp_segment_rcv only if buffer has data - Parse rcv opts before deleting half-open connection - Fix initial rcv_wnd - Improved event logging Change-Id: I9b83c04f432c4cec832c480b03e534deff02c3b1 Signed-off-by: Florin Coras --- src/vnet/session/node.c | 73 ++++++++++++ src/vnet/session/session.c | 38 ++++++- src/vnet/session/session.h | 4 + src/vnet/session/session_api.c | 7 -- src/vnet/session/session_cli.c | 22 +++- src/vnet/tcp/builtin_client.c | 11 +- src/vnet/tcp/builtin_server.c | 8 +- src/vnet/tcp/tcp.c | 59 ++++++++-- src/vnet/tcp/tcp.h | 12 +- src/vnet/tcp/tcp_debug.h | 246 +++++++++++++++++++++++++++++++---------- src/vnet/tcp/tcp_input.c | 165 ++++++++++++++++----------- src/vnet/tcp/tcp_output.c | 51 +++++---- src/vnet/tcp/tcp_test.c | 99 +++++++++++++++++ 13 files changed, 612 insertions(+), 183 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 983b78b8..8d703b0b 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -443,6 +443,79 @@ dump_thread_0_event_queue (void) } } +static u8 +session_node_cmp_event (session_fifo_event_t * e, svm_fifo_t * f) +{ + stream_session_t *s; + switch (e->event_type) + { + case FIFO_EVENT_APP_RX: + case FIFO_EVENT_APP_TX: + case FIFO_EVENT_BUILTIN_RX: + if (e->fifo == f) + return 1; + break; + case FIFO_EVENT_DISCONNECT: + break; + case 
FIFO_EVENT_RPC: + s = stream_session_get_from_handle (e->session_handle); + if (!s) + { + clib_warning ("session has event but doesn't exist!"); + break; + } + if (s->server_rx_fifo == f || s->server_tx_fifo == f) + return 1; + break; + default: + break; + } + return 0; +} + +u8 +session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + unix_shared_memory_queue_t *q; + session_fifo_event_t *pending_event_vector, *evt; + int i, index, found = 0; + i8 *headp; + u8 thread_index; + + ASSERT (e); + thread_index = f->master_thread_index; + /* + * Search evt queue + */ + q = smm->vpp_event_queues[thread_index]; + index = q->head; + for (i = 0; i < q->cursize; i++) + { + headp = (i8 *) (&q->data[0] + q->elsize * index); + clib_memcpy (e, headp, q->elsize); + found = session_node_cmp_event (e, f); + if (found) + break; + if (++index == q->maxsize) + index = 0; + } + /* + * Search pending events vector + */ + pending_event_vector = smm->pending_event_vector[thread_index]; + vec_foreach (evt, pending_event_vector) + { + found = session_node_cmp_event (evt, f); + if (found) + { + clib_memcpy (e, evt, sizeof (*evt)); + break; + } + } + return found; +} + static uword session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 2c2a27c1..09bc00e7 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -32,6 +32,22 @@ static transport_proto_vft_t *tp_vfts; session_manager_main_t session_manager_main; +transport_connection_t * +stream_session_lookup_half_open (transport_connection_t * tc) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + int rv; + if (tc->is_ip4) + { + make_v4_ss_kv_from_tc (&kv4, tc); + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[tc->proto].get_half_open (kv4.value & 
0xFFFFFFFFULL); + } + return 0; +} + /* * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) * Value: (owner thread index << 32 | session_index); @@ -501,7 +517,7 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, tc->s_index = s->session_index; /* Add to the main lookup table */ - value = (((u64) thread_index) << 32) | (u64) s->session_index; + value = stream_session_handle (s); stream_session_table_add_for_tc (tc, value); *ret_s = s; @@ -817,8 +833,18 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, } /* Notify client */ - app->cb_fns.session_connected_callback (app->index, api_context, new_s, - is_fail); + if (app->cb_fns.session_connected_callback (app->index, api_context, new_s, + is_fail)) + { + clib_warning ("failed to notify app"); + if (!is_fail) + stream_session_disconnect (new_s); + } + else + { + if (!is_fail) + new_s->session_state = SESSION_STATE_READY; + } /* Cleanup session lookup */ stream_session_half_open_table_del (smm, sst, tc); @@ -862,15 +888,19 @@ void stream_session_delete (stream_session_t * s) { session_manager_main_t *smm = vnet_get_session_manager_main (); + int rv; /* Delete from the main lookup table. 
*/ - stream_session_table_del (smm, s); + if ((rv = stream_session_table_del (smm, s))) + clib_warning ("hash delete error, rv %d", rv); /* Cleanup fifo segments */ segment_manager_dealloc_fifos (s->svm_segment_index, s->server_rx_fifo, s->server_tx_fifo); pool_put (smm->sessions[s->thread_index], s); + if (CLIB_DEBUG) + memset (s, 0xFA, sizeof (*s)); } /** diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 6069c574..6c616326 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -170,6 +170,8 @@ typedef int extern session_fifo_rx_fn session_tx_fifo_peek_and_snd; extern session_fifo_rx_fn session_tx_fifo_dequeue_and_snd; +u8 session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e); + struct _session_manager_main { /** Lookup tables for established sessions and listeners */ @@ -289,6 +291,8 @@ transport_connection_t *stream_session_lookup_transport6 (ip6_address_t * lcl, stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto); +transport_connection_t + * stream_session_lookup_half_open (transport_connection_t * tc); void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); int stream_session_table_del_for_tc (transport_connection_t * tc); diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 60f764af..6bee3e27 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -184,13 +184,6 @@ send_session_connected_callback (u32 app_index, u32 api_context, } vl_msg_api_send_shmem (q, (u8 *) & mp); - - /* Remove client if connect failed */ - if (!is_fail) - { - s->session_state = SESSION_STATE_READY; - } - return 0; } diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index e8e6f99c..4d432977 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -19,8 +19,24 @@ u8 * format_stream_session_fifos (u8 * s, va_list * args) { stream_session_t *ss = 
va_arg (*args, stream_session_t *); + int verbose = va_arg (*args, int); + session_fifo_event_t _e, *e = &_e; + u8 found; + s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1); + if (verbose > 2 && ss->server_rx_fifo->has_event) + { + found = session_node_lookup_fifo_event (ss->server_rx_fifo, e); + s = format (s, " session node event: %s\n", + found ? "found" : "not found"); + } s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1); + if (verbose > 2 && ss->server_tx_fifo->has_event) + { + found = session_node_lookup_fifo_event (ss->server_tx_fifo, e); + s = format (s, " session node event: %s\n", + found ? "found" : "not found"); + } return s; } @@ -55,7 +71,7 @@ format_stream_session (u8 * s, va_list * args) if (verbose == 1) s = format (s, "%v", str); if (verbose > 1) - s = format (s, "%U", format_stream_session_fifos, ss); + s = format (s, "%U", format_stream_session_fifos, ss, verbose); } else if (ss->session_state == SESSION_STATE_LISTENING) { @@ -75,7 +91,7 @@ format_stream_session (u8 * s, va_list * args) if (verbose == 1) s = format (s, "%v", str); if (verbose > 1) - s = format (s, "%U", format_stream_session_fifos, ss); + s = format (s, "%U", format_stream_session_fifos, ss, verbose); } else { @@ -248,7 +264,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, if (one_session) { - vlib_cli_output (vm, "%U", format_stream_session, s, 2); + vlib_cli_output (vm, "%U", format_stream_session, s, 3); return 0; } diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index a92bacaa..744f50e7 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -410,9 +410,6 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, return -1; } - /* Mark vpp session as connected */ - s->session_state = SESSION_STATE_READY; - tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index); tm->vpp_event_queue = session_manager_get_vpp_event_queue 
(s->thread_index); @@ -466,6 +463,7 @@ builtin_session_reset_callback (stream_session_t * s) { if (s->session_state == SESSION_STATE_READY) clib_warning ("Reset active connection %U", format_stream_session, s, 2); + stream_session_cleanup (s); return; } @@ -478,6 +476,11 @@ builtin_session_create_callback (stream_session_t * s) static void builtin_session_disconnect_callback (stream_session_t * s) { + tclient_main_t *tm = &tclient_main; + vnet_disconnect_args_t _a, *a = &_a; + a->handle = stream_session_handle (s); + a->app_index = tm->app_index; + vnet_disconnect_session (a); return; } @@ -521,7 +524,7 @@ attach_builtin_test_clients_app (void) options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; options[SESSION_OPTIONS_SEGMENT_SIZE] = (2ULL << 32); options[SESSION_OPTIONS_RX_FIFO_SIZE] = tm->fifo_size; - options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size / 2; + options[SESSION_OPTIONS_TX_FIFO_SIZE] = tm->fifo_size; options[APP_OPTIONS_PRIVATE_SEGMENT_COUNT] = tm->private_segment_count; options[APP_OPTIONS_PRIVATE_SEGMENT_SIZE] = tm->private_segment_size; options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = prealloc_fifos; diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 4ecaf56a..3416678e 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -213,15 +213,15 @@ builtin_server_rx_callback (stream_session_t * s) q = bsm->vpp_queue[thread_index]; if (PREDICT_FALSE (q->cursize == q->maxsize)) clib_warning ("out of event queue space"); - else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* don't wait for mutex */ - )) + else if (unix_shared_memory_queue_add (q, (u8 *) & evt, 0)) clib_warning ("failed to enqueue self-tap"); - bsm->rx_retries[thread_index][s->session_index]++; if (bsm->rx_retries[thread_index][s->session_index] == 500000) { clib_warning ("session stuck: %U", format_stream_session, s, 2); } + if (bsm->rx_retries[thread_index][s->session_index] < 500001) + 
bsm->rx_retries[thread_index][s->session_index]++; } return 0; @@ -303,7 +303,7 @@ create_api_loopback (vlib_main_t * vm) /* Wait for reply */ bsm->node_index = vlib_get_current_process (vm)->node_runtime.node_index; - vlib_process_wait_for_event_or_clock (vm, 1.0); + vlib_process_wait_for_event_or_clock (vm, 2.0); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 8ed325d2..a2214158 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -163,6 +163,33 @@ tcp_connection_del (tcp_connection_t * tc) tcp_connection_cleanup (tc); } +/** + * Cleanup half-open connection + */ +void +tcp_half_open_connection_del (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + if (CLIB_DEBUG) + memset (tc, 0xFA, sizeof (*tc)); + clib_spinlock_lock (&tm->half_open_lock); + pool_put (tm->half_open_connections, tc); + clib_spinlock_unlock (&tm->half_open_lock); +} + +tcp_connection_t * +tcp_connection_new (u8 thread_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + + pool_get (tm->connections[thread_index], tc); + memset (tc, 0, sizeof (*tc)); + tc->c_c_index = tc - tm->connections[thread_index]; + tc->c_thread_index = thread_index; + return tc; +} + /** Notify session that connection has been reset. * * Switch state to closed and wait for session to call cleanup. 
@@ -170,6 +197,7 @@ tcp_connection_del (tcp_connection_t * tc) void tcp_connection_reset (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_RST_RCVD, tc); switch (tc->state) { case TCP_STATE_SYN_RCVD: @@ -178,12 +206,18 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_connection_cleanup (tc); break; case TCP_STATE_SYN_SENT: + /* XXX remove sst from call */ + stream_session_connect_notify (&tc->connection, tc->connection.proto, + 1 /* fail */ ); + tcp_connection_cleanup (tc); + break; case TCP_STATE_ESTABLISHED: case TCP_STATE_CLOSE_WAIT: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: case TCP_STATE_CLOSING: tc->state = TCP_STATE_CLOSED; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); /* Make sure all timers are cleared */ tcp_connection_timers_reset (tc); @@ -227,6 +261,7 @@ tcp_connection_close (tcp_connection_t * tc) tc->state = TCP_STATE_CLOSED; else if (tc->state == TCP_STATE_CLOSE_WAIT) tc->state = TCP_STATE_LAST_ACK; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID @@ -250,6 +285,7 @@ tcp_session_cleanup (u32 conn_index, u32 thread_index) /* Wait for the session tx events to clear */ tc->state = TCP_STATE_CLOSED; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); } @@ -287,7 +323,7 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) * Allocate local port and add if successful add entry to local endpoint * table to mark the pair as used. 
*/ -u16 +int tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) { transport_endpoint_t *tep; @@ -484,7 +520,7 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) fib_node_index_t fei; u32 sw_if_index; ip46_address_t lcl_addr; - u16 lcl_port; + int lcl_port; /* * Find the local address and allocate port @@ -500,12 +536,19 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) /* Couldn't find route to destination. Bail out. */ if (fei == FIB_NODE_INDEX_INVALID) - return -1; + { + clib_warning ("no route to destination"); + return -1; + } sw_if_index = fib_entry_get_resolving_interface (fei); if (sw_if_index == (u32) ~ 0) - return -1; + { + clib_warning ("no resolving interface for %U", format_ip46_address, + rmt_addr, IP46_TYPE_IP4); + return -1; + } if (is_ip4) { @@ -570,11 +613,9 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) /* The other connection vars will be initialized after SYN ACK */ tcp_connection_timers_init (tc); - tcp_send_syn (tc); - - tc->state = TCP_STATE_SYN_SENT; - TCP_EVT_DBG (TCP_EVT_OPEN, tc); + tc->state = TCP_STATE_SYN_SENT; + tcp_send_syn (tc); return tc->c_c_index; } @@ -1206,7 +1247,7 @@ tcp_main_enable (vlib_main_t * vm) clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", 200000 /* $$$$ config parameter nbuckets */ , (64 << 20) /*$$$ config parameter table size */ ); - + clib_spinlock_init (&tm->half_open_lock); return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index fd0d02b9..89c30616 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -33,6 +33,7 @@ #define TCP_DUPACK_THRESHOLD 3 #define TCP_MAX_RX_FIFO_SIZE 4 << 20 +#define TCP_MIN_RX_FIFO_SIZE 4 << 10 #define TCP_IW_N_SEGMENTS 10 #define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */ #define TCP_USE_SACKS 1 /**< Disable only for testing */ @@ -371,11 +372,9 @@ typedef struct _tcp_main /* Per worker-thread timer wheel for connections timers */ 
tw_timer_wheel_16t_2w_512sl_t *timer_wheels; -// /* Convenience per worker-thread vector of connections to DELACK */ -// u32 **delack_connections; - /* Pool of half-open connections on which we've sent a SYN */ tcp_connection_t *half_open_connections; + clib_spinlock_t half_open_lock; /* Pool of local TCP endpoints */ transport_endpoint_t *local_endpoints; @@ -455,6 +454,8 @@ tcp_get_connection_from_transport (transport_connection_t * tconn) void tcp_connection_close (tcp_connection_t * tc); void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); +void tcp_half_open_connection_del (tcp_connection_t * tc); +tcp_connection_t *tcp_connection_new (u8 thread_index); void tcp_connection_reset (tcp_connection_t * tc); u8 *format_tcp_connection_id (u8 * s, va_list * args); @@ -472,13 +473,15 @@ tcp_listener_get (u32 tli) always_inline tcp_connection_t * tcp_half_open_connection_get (u32 conn_index) { + if (pool_is_free_index (tcp_main.half_open_connections, conn_index)) + return 0; return pool_elt_at_index (tcp_main.half_open_connections, conn_index); } void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); -void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4); +void tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4); void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); @@ -658,7 +661,6 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) tc->c_c_index, timer_id, interval); } -/* XXX Switch retransmit to faster TW */ always_inline void tcp_retransmit_timer_set (tcp_connection_t * tc) { diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index be51bca2..e3da56f4 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,10 +19,9 @@ #include #define 
TCP_DEBUG (1) -#define TCP_DEBUG_SM (0) -#define TCP_DEBUG_CC (1) -#define TCP_DEBUG_CC_STAT (1) -#define TCP_DEBUG_SM_VERBOSE (0) +#define TCP_DEBUG_SM (2) +#define TCP_DEBUG_CC (0) +#define TCP_DEBUG_CC_STAT (0) #define foreach_tcp_dbg_evt \ _(INIT, "") \ @@ -33,7 +32,9 @@ _(UNBIND, "unbind") \ _(DELETE, "delete") \ _(SYN_SENT, "SYN sent") \ - _(SYN_RTX, "SYN retransmit") \ + _(SYNACK_SENT, "SYNACK sent") \ + _(SYNACK_RCVD, "SYNACK rcvd") \ + _(SYN_RXT, "SYN retransmit") \ _(FIN_SENT, "FIN sent") \ _(ACK_SENT, "ACK sent") \ _(DUPACK_SENT, "DUPACK sent") \ @@ -43,6 +44,7 @@ _(DUPACK_RCVD, "DUPACK rcvd") \ _(FIN_RCVD, "FIN rcvd") \ _(RST_RCVD, "RST rcvd") \ + _(STATE_CHANGE, "state change") \ _(PKTIZE, "packetize") \ _(INPUT, "in") \ _(SND_WND, "snd_wnd update") \ @@ -96,11 +98,64 @@ typedef enum _tcp_dbg_evt ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, \ _e, _tc->c_elog_track) -#define TCP_EVT_INIT_HANDLER(_tc, _fmt, ...) \ +#define TCP_DBG_IP_TAG_LCL(_tc) \ { \ - _tc->c_elog_track.name = \ - (char *) format (0, _fmt, _tc->c_c_index, 0); \ + if (_tc->c_is_ip4) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "lcl: %d.%d.%d.%d:%d", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->c_lcl_ip.ip4.as_u8[0]; \ + ed->data[1] = _tc->c_lcl_ip.ip4.as_u8[1]; \ + ed->data[2] = _tc->c_lcl_ip.ip4.as_u8[2]; \ + ed->data[3] = _tc->c_lcl_ip.ip4.as_u8[3]; \ + ed->data[4] = clib_net_to_host_u16(_tc->c_lcl_port); \ + } \ +} + +#define TCP_DBG_IP_TAG_RMT(_tc) \ +{ \ + if (_tc->c_is_ip4) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rmt: %d.%d.%d.%d:%d", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->c_rmt_ip.ip4.as_u8[0]; \ + ed->data[1] = _tc->c_rmt_ip.ip4.as_u8[1]; \ + ed->data[2] = _tc->c_rmt_ip.ip4.as_u8[2]; \ + ed->data[3] = _tc->c_rmt_ip.ip4.as_u8[3]; \ + ed->data[4] = clib_net_to_host_u16(_tc->c_rmt_port); \ + } \ +} + +#define TCP_EVT_INIT_HANDLER(_tc, _is_l, ...) 
\ +{ \ + char *_fmt = _is_l ? "l[%d].%d:%d%c" : "[%d].%d:%d->.%d:%d%c"; \ + if (_tc->c_is_ip4) \ + { \ + _tc->c_elog_track.name = \ + (char *) format (0, _fmt, _tc->c_thread_index, \ + _tc->c_lcl_ip.ip4.as_u8[3], \ + clib_net_to_host_u16(_tc->c_lcl_port), \ + _tc->c_rmt_ip.ip4.as_u8[3], \ + clib_net_to_host_u16(_tc->c_rmt_port), 0); \ + } \ + else \ + _tc->c_elog_track.name = \ + (char *) format (0, _fmt, _tc->c_thread_index, \ + _tc->c_lcl_ip.ip6.as_u8[15], \ + clib_net_to_host_u16(_tc->c_lcl_port), \ + _tc->c_rmt_ip.ip6.as_u8[15], \ + clib_net_to_host_u16(_tc->c_rmt_port), 0); \ elog_track_register (&vlib_global_main.elog_main, &_tc->c_elog_track);\ + TCP_DBG_IP_TAG_LCL(_tc); \ + TCP_DBG_IP_TAG_RMT(_tc); \ } #define TCP_EVT_DEALLOC_HANDLER(_tc, ...) \ @@ -110,7 +165,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_OPEN_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "open: index %d", \ @@ -133,7 +188,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_BIND_HANDLER(_tc, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc, "l%d%c"); \ + TCP_EVT_INIT_HANDLER(_tc, 1); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "bind: listener %d", \ @@ -166,18 +221,6 @@ typedef enum _tcp_dbg_evt TCP_EVT_DEALLOC_HANDLER(_tc); \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ -{ \ - TCP_EVT_INIT_HANDLER(_tc, "s%d%c"); \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "SYNrx: irs %u", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->irs; \ -} - #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) @@ -190,63 +233,86 @@ typedef enum _tcp_dbg_evt */ #if TCP_DEBUG_SM -#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ +#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\ - .format_args = "i4i4i4i4i4", \ + .format = "state: %s", \ + .format_args = "t4", \ + .n_enum_strings = 11, \ + .enum_strings = { \ + "closed", \ + "listen", \ + "syn-sent", \ + "syn-rcvd", \ + "established", \ + "close_wait", \ + "fin-wait-1", \ + "last-ack", \ + "closing", \ + "fin-wait-2", \ + "time-wait", \ + }, \ }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ - ed->data[1] = _tc->rcv_nxt - _tc->irs; \ - ed->data[2] = _tc->rcv_wnd; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ - ed->data[4] = _tc->snd_wnd; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->state; \ } -#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ { \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ - .format_args = "i4i4i4i4i4", \ + .format = "syn-rx: irs %u", \ + .format_args = "i4", \ }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_nxt - _tc->irs; \ - ed->data[1] = _tc->rcv_wnd; \ - ed->data[2] = _tc->snd_nxt - _tc->iss; \ - ed->data[3] = tcp_available_wnd(_tc); \ - ed->data[4] = _tc->snd_wnd; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYNtx: iss %u", \ + .format = "syn-tx: iss %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ ed->data[0] = _tc->iss; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } -#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) \ +#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYNrtx: iss %u", \ - .format_args = "i4", \ + .format = "synack-tx: iss %u irs %u", \ + .format_args = "i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 1); \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->irs; \ +} + +#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "synack-rx: iss %u irs %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } #define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FINtx: snd_nxt %d rcv_nxt %d", \ + .format = "fin-tx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -258,19 +324,20 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RSTtx: snd_nxt %d rcv_nxt %d", \ + .format = "rst-tx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ ed->data[0] = _tc->snd_nxt - _tc->iss; \ ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } #define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "FINrx: snd_nxt %d rcv_nxt %d", \ + .format = "fin-rx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -282,7 +349,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "RSTrx: snd_nxt %d rcv_nxt %d", \ + .format = "rst-rx: snd_nxt %d rcv_nxt %d", \ .format_args = "i4i4", \ }; \ DECLARE_ETD(_tc, _e, 2); \ @@ -290,6 +357,67 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } +#define TCP_EVT_SYN_RXT_HANDLER(_tc, _type, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "%s-rxt: iss %u", \ + .format_args = "t4i4", \ + .n_enum_strings = 2, \ + .enum_strings = { \ + "syn", \ + "syn-ack", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = _type; \ + ed->data[1] = _tc->iss; \ +} + +#else +#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_SYN_RXT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) +#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) +#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_STATE_CHANGE_HANDLER(_tc, ...) +#endif + +#if TCP_DEBUG_SM > 1 + +#define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ + ed->data[1] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->rcv_wnd; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_wnd; \ +} + +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_nxt - _tc->irs; \ + ed->data[1] = _tc->rcv_wnd; \ + ed->data[2] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = tcp_available_wnd(_tc); \ + ed->data[4] = _tc->snd_wnd; \ +} + #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -309,7 +437,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ + .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ @@ -370,7 +498,7 @@ typedef enum _tcp_dbg_evt } \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "TimerPop: %s (%d)", \ + .format = "timer-pop: %s (%d)", \ .format_args = "t4i4", \ .n_enum_strings = 7, \ .enum_strings = { \ @@ -391,7 +519,8 @@ typedef enum _tcp_dbg_evt } \ else \ { \ - clib_warning ("pop for unexisting connection %d", _tc_index); \ + clib_warning ("pop %d for unexisting connection %d", _timer_id, \ + _tc_index); \ } \ } @@ -414,7 +543,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "paws fail: seq %u end %u tsval %u tsval_recent %u", \ + .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \ .format_args = "i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 4); \ @@ -465,12 +594,6 @@ if (_av > 0) \ #else #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) -#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) -#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) -#define TCP_EVT_FIN_SENT_HANDLER(_tc, ...) -#define TCP_EVT_RST_SENT_HANDLER(_tc, ...) -#define TCP_EVT_FIN_RCVD_HANDLER(_tc, ...) -#define TCP_EVT_RST_RCVD_HANDLER(_tc, ...) #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) @@ -485,12 +608,12 @@ if (_av > 0) \ /* * State machine verbose */ -#if TCP_DBG_SM_VERBOSE +#if TCP_DEBUG_SM > 2 #define TCP_EVT_SND_WND_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "snd_wnd update: %u ", \ + .format = "snd-wnd update: %u ", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -617,6 +740,7 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) +#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) #endif #endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index bc7d9015..cc5cecdc 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -349,7 +349,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* 4th: check the SYN bit */ if (tcp_syn (th0)) { - tcp_send_reset (b0, tc0->c_is_ip4); + /* TODO implement RFC 5961 */ + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0); return -1; } @@ -1246,8 +1249,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, * Looks okay, process feedback */ - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); - if (tcp_opts_sack_permitted (&tc->rcv_opts)) tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); @@ -1263,6 +1264,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tc->bytes_acked) tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + /* * Check if we have congestion event */ @@ -1496,9 +1499,13 @@ tcp_can_delack (tcp_connection_t * tc) static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, - u16 n_data_bytes, u32 * next0) + u32 * next0) { - u32 error = 0, n_bytes_to_drop; + u32 error = 0, n_bytes_to_drop, n_data_bytes; + + vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset); + n_data_bytes = vnet_buffer (b)->tcp.data_len; + ASSERT (n_data_bytes); /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) @@ -1512,7 +1519,12 @@ 
tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Completely in the past (possible retransmit) */ if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) - goto done; + { + /* Ack retransmissions since we may not have any data to send */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + goto done; + } /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; @@ -1550,12 +1562,6 @@ in_order: * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); - if (n_data_bytes == 0) - { - *next0 = TCP_NEXT_DROP; - goto done; - } - /* Check if ACK can be delayed */ if (tcp_can_delack (tc)) { @@ -1680,7 +1686,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } th0 = tcp_buffer_hdr (b0); - is_fin = (th0->flags & TCP_FLAG_FIN) != 0; + /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a + * dangling reference. */ + is_fin = tcp_is_fin (th0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number @@ -1700,29 +1708,23 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 5: check the ACK field */ if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) - { - goto done; - } + goto done; /* 6: check the URG bit TODO */ /* 7: process the segment text */ - - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, tc0, b0, - vnet_buffer (b0)->tcp.data_len, &next0); - - /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a - * dangling reference. */ + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); /* 8: check the FIN bit */ - if (is_fin) + if (PREDICT_FALSE (is_fin)) { /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead * wait for session to call close. To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). 
*/ tc0->state = TCP_STATE_CLOSE_WAIT; TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0); stream_session_disconnect_notify (&tc0->connection); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } @@ -1856,6 +1858,21 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); + if (!tc0) + { + ip4_header_t *ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + tc0 = + (tcp_connection_t *) + stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + ASSERT (0); + goto drop; + } if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; @@ -1881,8 +1898,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) { if (!tcp_rst (tcp0)) - tcp_send_reset (b0, is_ip4); - + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -1900,11 +1916,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If ACK is acceptable, signal client that peer is not * willing to accept connection and drop connection*/ if (tcp_ack (tcp0)) - { - stream_session_connect_notify (&tc0->connection, sst, - 1 /* fail */ ); - tcp_connection_cleanup (tc0); - } + tcp_connection_reset (tc0); goto drop; } @@ -1920,6 +1932,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (!tcp_syn (tcp0)) goto drop; + /* Parse options */ + if (tcp_options_parse (tcp0, &tc0->rcv_opts)) + goto drop; + /* Stop connection establishment and retransmit timers */ tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); @@ -1928,19 +1944,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * current thread pool. 
*/ pool_get (tm->connections[my_thread_index], new_tc0); clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); - - new_tc0->c_thread_index = my_thread_index; new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; - - /* Cleanup half-open connection XXX lock */ - pool_put (tm->half_open_connections, tc0); - + new_tc0->c_thread_index = my_thread_index; new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; new_tc0->irs = seq0; - - /* Parse options */ - if (tcp_options_parse (tcp0, &new_tc0->rcv_opts)) - goto drop; + tcp_half_open_connection_del (tc0); if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { @@ -1959,7 +1967,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_connection_init_vars (new_tc0); /* SYN-ACK: See if we can switch to ESTABLISHED state */ - if (tcp_ack (tcp0)) + if (PREDICT_TRUE (tcp_ack (tcp0))) { /* Our SYN is ACKed: we have iss < ack = snd_una */ @@ -1976,7 +1984,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -1986,6 +1994,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Update rtt with the syn-ack sample */ new_tc0->bytes_acked = 1; tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); } /* SYN: Simultaneous open. 
Change state to SYN-RCVD and send SYN-ACK */ else @@ -1997,12 +2006,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, (&new_tc0->connection, sst, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0); goto drop; } tc0->rtt_ts = 0; - tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2010,12 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Read data, if any */ - if (vnet_buffer (b0)->tcp.data_len) + if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len)) { - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, new_tc0, b0, - vnet_buffer (b0)->tcp.data_len, - &next0); + ASSERT (0); + error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0); if (error0 == TCP_ERROR_PURE_ACK) error0 = TCP_ERROR_SYN_ACKS_RCVD; } @@ -2114,6 +2121,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) = /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); + /** * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED * as per RFC793 p. 
64 @@ -2202,7 +2210,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -2243,6 +2251,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); } @@ -2269,6 +2279,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* XXX test that send queue empty */ tc0->state = TCP_STATE_TIME_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); goto drop; break; @@ -2289,6 +2300,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tc0->state = TCP_STATE_CLOSED; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); /* Don't delete the connection/session yet. Instead, wait a * reasonable amount of time until the pipes are cleared. 
In @@ -2329,10 +2341,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, tc0, b0, - vnet_buffer (b0)->tcp.data_len, - &next0); + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2357,6 +2367,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2367,6 +2378,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! */ @@ -2375,6 +2387,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_TIME_WAIT: /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait @@ -2486,7 +2499,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->thread_index; - tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -2549,14 +2561,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 3. 
check for a SYN (did that already) */ /* Create child session and send SYN-ACK */ - pool_get (tm->connections[my_thread_index], child0); - memset (child0, 0, sizeof (*child0)); - - child0->c_c_index = child0 - tm->connections[my_thread_index]; + child0 = tcp_connection_new (my_thread_index); child0->c_lcl_port = lc0->c_lcl_port; child0->c_rmt_port = th0->src_port; child0->c_is_ip4 = is_ip4; - child0->c_thread_index = my_thread_index; child0->state = TCP_STATE_SYN_RCVD; if (is_ip4) @@ -2605,7 +2613,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; tcp_connection_init_vars (child0); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0); /* Reuse buffer to make syn-ack and send */ @@ -2722,6 +2729,31 @@ typedef enum _tcp_input_next #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) +static u8 +tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) +{ + transport_connection_t *tmp; + if (!tc) + return 1; + + u8 is_valid = (tc->c_lcl_port == hdr->dst_port + && (tc->state == TCP_STATE_LISTEN + || tc->c_rmt_port == hdr->src_port)); + + if (!is_valid) + { + if ((tmp = stream_session_lookup_half_open (&tc->connection))) + { + if (tmp->lcl_port == hdr->dst_port + && tmp->rmt_port == hdr->src_port) + { + clib_warning ("half-open is valid!"); + } + } + } + return is_valid; +} + always_inline uword tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -2774,7 +2806,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0; - /* lookup session */ tc0 = (tcp_connection_t *) stream_session_lookup_transport_wt4 (&ip40->dst_address, @@ -2783,6 +2814,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp0->src_port, SESSION_TYPE_IP4_TCP, my_thread_index); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } else { @@ -2795,12 +2827,13 @@ 
tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = (tcp_connection_t *) - stream_session_lookup_transport_wt6 (&ip60->src_address, - &ip60->dst_address, - tcp0->src_port, + stream_session_lookup_transport_wt6 (&ip60->dst_address, + &ip60->src_address, tcp0->dst_port, + tcp0->src_port, SESSION_TYPE_IP6_TCP, my_thread_index); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } /* Length check */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 35f3eba1..5e9ecf11 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -75,12 +75,34 @@ tcp_window_compute_scale (u32 available_space) } /** - * TCP's IW as recommended by RFC6928 + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * TCP's initial window */ always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * tc->mss; + /* RFC 6928 recommends the value lower. However at the time our connections + * are initialized, fifos may not be allocated. Therefore, advertise the + * smallest possible unscaled window size and update once fifos are + * assigned to the session. + */ + /* + tcp_update_rcv_mss (tc); + TCP_IW_N_SEGMENTS * tc->mss; + */ + return TCP_MIN_RX_FIFO_SIZE; } /** @@ -372,19 +394,6 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } -/** - * Update max segment size we're able to process. - * - * The value is constrained by our interface's MTU and IP options. It is - * also what we advertise to our peer. 
- */ -void -tcp_update_rcv_mss (tcp_connection_t * tc) -{ - /* TODO find our iface MTU */ - tc->mss = dummy_mtu; -} - /** * Update snd_mss to reflect the effective segment size that we can send * by taking into account all TCP options, including SACKs @@ -576,6 +585,7 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) /* Init retransmit timer */ tcp_retransmit_timer_set (tc); + TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } always_inline void @@ -684,7 +694,7 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, * Send reset without reusing existing buffer */ void -tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) +tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) { vlib_buffer_t *b; u32 bi; @@ -720,7 +730,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) { flags = TCP_FLAG_RST; seq = pkt_th->ack_number; - ack = 0; + ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; } else { @@ -754,6 +764,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) } tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } void @@ -839,6 +850,7 @@ tcp_send_syn (tcp_connection_t * tc) tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } always_inline void @@ -1148,12 +1160,13 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; tc->rtt_ts = 0; + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, + (tc->state == TCP_STATE_SYN_SENT ? 
0 : 1)); } else { @@ -1173,8 +1186,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); - TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); - /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index f37ba96d..5c40ddf9 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -1550,6 +1550,101 @@ tcp_test_fifo (vlib_main_t * vm, unformat_input_t * input) return res; } +static int +tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) +{ + session_manager_main_t *smm = &session_manager_main; + tcp_main_t *tm = &tcp_main; + transport_connection_t _tc1, *tc1 = &_tc1, _tc2, *tc2 = &_tc2, *tconn; + tcp_connection_t *tc; + stream_session_t *s; + u8 cmp = 0; + + pool_get (smm->sessions[0], s); + memset (s, 0, sizeof (*s)); + s->session_index = s - smm->sessions[0]; + + pool_get (tm->connections[0], tc); + memset (tc, 0, sizeof (*tc)); + tc->connection.c_index = tc - tm->connections[0]; + tc->connection.s_index = s->session_index; + s->connection_index = tc->connection.c_index; + + tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101); + tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000103); + tc->connection.lcl_port = 35051; + tc->connection.rmt_port = 53764; + tc->connection.proto = 0; + clib_memcpy (tc1, &tc->connection, sizeof (*tc1)); + + pool_get (session_manager_main.sessions[0], s); + memset (s, 0, sizeof (*s)); + s->session_index = s - smm->sessions[0]; + pool_get (tm->connections[0], tc); + memset (tc, 0, sizeof (*tc)); + tc->connection.c_index = tc - tm->connections[0]; + tc->connection.s_index = s->session_index; + s->connection_index = tc->connection.c_index; + + tc->connection.lcl_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000101); + tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000102); + tc->connection.lcl_port = 38225; + 
tc->connection.rmt_port = 53764; + tc->connection.proto = 0; + clib_memcpy (tc2, &tc->connection, sizeof (*tc2)); + + /* + * Confirm that connection lookup works + */ + + stream_session_table_add_for_tc (tc1, tc1->s_index); + tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, + &tc1->rmt_ip.ip4, + tc1->lcl_port, tc1->rmt_port, + tc1->proto, 0); + cmp = (memcmp (&tconn->rmt_ip, &tc1->rmt_ip, sizeof (tc1->rmt_ip)) == 0); + TCP_TEST ((cmp), "rmt ip is identical %d", cmp); + TCP_TEST ((tconn->lcl_port == tc1->lcl_port), + "rmt port is identical %d", tconn->lcl_port == tc1->lcl_port); + + /* + * Non-existing connection lookup should not work + */ + + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + /* + * Delete and lookup again + */ + stream_session_table_del_for_tc (tc1); + tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, + &tc1->rmt_ip.ip4, + tc1->lcl_port, tc1->rmt_port, + tc1->proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + /* + * Re-add and lookup tc2 + */ + stream_session_table_add_for_tc (tc1, tc1->s_index); + tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, + &tc2->rmt_ip.ip4, + tc2->lcl_port, tc2->rmt_port, + tc2->proto, 0); + TCP_TEST ((tconn == 0), "lookup result should be null"); + + return 0; +} + static int tcp_test_session (vlib_main_t * vm, unformat_input_t * input) { @@ -1632,6 +1727,10 @@ tcp_test (vlib_main_t * vm, { res = tcp_test_session (vm, input); } + else if (unformat (input, "lookup")) + { + res = tcp_test_lookup (vm, input); + } else break; } -- cgit 1.2.3-korg From 04e5344a358a9ad42d896486d2d226149fd326f4 Mon Sep 17 00:00:00 2001 From: Florin 
Coras Date: Sun, 16 Jul 2017 17:12:15 -0700 Subject: Cleanup/refactor session layer code Change-Id: Ica99e8cb919fca6b069c37c969d60e8ccc2c6bf9 Signed-off-by: Florin Coras --- src/uri/uri_tcp_test.c | 4 - src/uri/uri_udp_test.c | 4 - src/vnet.am | 9 +- src/vnet/session/hashes.c | 28 -- src/vnet/session/node.c | 685 --------------------------------- src/vnet/session/session.c | 506 +----------------------- src/vnet/session/session.h | 164 +------- src/vnet/session/session_lookup.c | 620 +++++++++++++++++++++++++++++ src/vnet/session/session_lookup.h | 101 +++++ src/vnet/session/session_node.c | 685 +++++++++++++++++++++++++++++++++ src/vnet/session/stream_session.h | 98 +++++ src/vnet/session/transport.c | 64 --- src/vnet/session/transport.h | 174 +-------- src/vnet/session/transport_interface.c | 106 +++++ src/vnet/session/transport_interface.h | 82 ++++ src/vnet/tcp/tcp.c | 122 +++--- src/vnet/tcp/tcp_input.c | 5 +- src/vnet/udp/udp.c | 22 +- 18 files changed, 1800 insertions(+), 1679 deletions(-) delete mode 100644 src/vnet/session/hashes.c delete mode 100644 src/vnet/session/node.c create mode 100644 src/vnet/session/session_lookup.c create mode 100644 src/vnet/session/session_lookup.h create mode 100644 src/vnet/session/session_node.c create mode 100644 src/vnet/session/stream_session.h delete mode 100644 src/vnet/session/transport.c create mode 100644 src/vnet/session/transport_interface.c create mode 100644 src/vnet/session/transport_interface.h (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index 80aab183..f5fbbd23 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -36,10 +36,6 @@ #include #undef vl_printfun -/* Satisfy external references when not linking with -lvlib */ -vlib_main_t vlib_global_main; -vlib_main_t **vlib_mains; - typedef struct { svm_fifo_t *server_rx_fifo; diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index a8e39eaa..aea4707c 100644 --- 
a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -47,10 +47,6 @@ #include #undef vl_printfun -/* Satisfy external references when not linking with -lvlib */ -vlib_main_t vlib_global_main; -vlib_main_t **vlib_mains; - typedef enum { STATE_START, diff --git a/src/vnet.am b/src/vnet.am index ebcf0a0a..060e3f38 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -832,19 +832,22 @@ nobase_include_HEADERS += \ libvnet_la_SOURCES += \ vnet/session/session.c \ - vnet/session/node.c \ - vnet/session/transport.c \ + vnet/session/session_lookup.c \ + vnet/session/session_node.c \ + vnet/session/transport_interface.c \ vnet/session/application.c \ vnet/session/session_cli.c \ - vnet/session/hashes.c \ vnet/session/application_interface.c \ vnet/session/segment_manager.c \ vnet/session/session_api.c nobase_include_HEADERS += \ vnet/session/session.h \ + vnet/session/stream_session.h \ + vnet/session/session_lookup.h \ vnet/session/application.h \ vnet/session/transport.h \ + vnet/session/transport_interface.h \ vnet/session/application_interface.h \ vnet/session/session_debug.h \ vnet/session/segment_manager.h \ diff --git a/src/vnet/session/hashes.c b/src/vnet/session/hashes.c deleted file mode 100644 index 1808dd73..00000000 --- a/src/vnet/session/hashes.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Generate typed init functions for multiple hash table styles... 
*/ - -#include -#include - -#include - -#undef __included_bihash_template_h__ - -#include -#include - -#include diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c deleted file mode 100644 index 8d703b0b..00000000 --- a/src/vnet/session/node.c +++ /dev/null @@ -1,685 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -vlib_node_registration_t session_queue_node; - -typedef struct -{ - u32 session_index; - u32 server_thread_index; -} session_queue_trace_t; - -/* packet trace format function */ -static u8 * -format_session_queue_trace (u8 * s, va_list * args) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - session_queue_trace_t *t = va_arg (*args, session_queue_trace_t *); - - s = format (s, "SESSION_QUEUE: session index %d, server thread index %d", - t->session_index, t->server_thread_index); - return s; -} - -vlib_node_registration_t session_queue_node; - -#define foreach_session_queue_error \ -_(TX, "Packets transmitted") \ -_(TIMER, "Timer events") \ -_(NO_BUFFER, "Out of buffers") - -typedef enum -{ -#define _(sym,str) SESSION_QUEUE_ERROR_##sym, - foreach_session_queue_error -#undef _ - SESSION_QUEUE_N_ERROR, -} session_queue_error_t; - -static char *session_queue_error_strings[] = { -#define _(sym,string) 
string, - foreach_session_queue_error -#undef _ -}; - -static u32 session_type_to_next[] = { - SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, - SESSION_QUEUE_NEXT_IP4_LOOKUP, - SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, - SESSION_QUEUE_NEXT_IP6_LOOKUP, -}; - -always_inline void -session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, - u8 thread_index, svm_fifo_t * fifo, - vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, - u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, - u16 deq_per_buf, u8 peek_data) -{ - vlib_buffer_t *chain_b0, *prev_b0; - u32 chain_bi0; - u16 len_to_deq0, n_bytes_read; - u8 *data0, j; - - chain_bi0 = bi0; - chain_b0 = b0; - for (j = 1; j < n_bufs_per_seg; j++) - { - prev_b0 = chain_b0; - len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); - - *n_bufs -= 1; - chain_bi0 = smm->tx_buffers[thread_index][*n_bufs]; - _vec_len (smm->tx_buffers[thread_index]) = *n_bufs; - - chain_b0 = vlib_get_buffer (vm, chain_bi0); - chain_b0->current_data = 0; - data0 = vlib_buffer_get_current (chain_b0); - if (peek_data) - { - n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0); - *rx_offset += n_bytes_read; - } - else - { - n_bytes_read = svm_fifo_dequeue_nowait (fifo, len_to_deq0, data0); - } - ASSERT (n_bytes_read == len_to_deq0); - chain_b0->current_length = n_bytes_read; - b0->total_length_not_including_first_buffer += chain_b0->current_length; - - /* update previous buffer */ - prev_b0->next_buffer = chain_bi0; - prev_b0->flags |= VLIB_BUFFER_NEXT_PRESENT; - - /* update current buffer */ - chain_b0->next_buffer = 0; - - *left_to_snd0 -= n_bytes_read; - if (*left_to_snd0 == 0) - break; - } -} - -always_inline int -session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, - session_fifo_event_t * e0, - stream_session_t * s0, u32 thread_index, - int *n_tx_packets, u8 peek_data) -{ - u32 n_trace = vlib_get_trace_count (vm, node); - u32 left_to_snd0, max_len_to_snd0, len_to_deq0, snd_space0; - 
u32 n_bufs_per_evt, n_frames_per_evt; - transport_connection_t *tc0; - transport_proto_vft_t *transport_vft; - u32 next_index, next0, *to_next, n_left_to_next, bi0; - vlib_buffer_t *b0; - u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg; - u16 snd_mss0, n_bufs_per_seg, n_bufs; - u8 *data0; - int i, n_bytes_read; - u32 n_bytes_per_buf, deq_per_buf; - u32 buffers_allocated, buffers_allocated_this_call; - - next_index = next0 = session_type_to_next[s0->session_type]; - - transport_vft = session_get_transport_vft (s0->session_type); - tc0 = transport_vft->get_connection (s0->connection_index, thread_index); - - /* Make sure we have space to send and there's something to dequeue */ - snd_mss0 = transport_vft->send_mss (tc0); - snd_space0 = transport_vft->send_space (tc0); - - /* Can't make any progress */ - if (snd_space0 == 0 || snd_mss0 == 0) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - return 0; - } - - if (peek_data) - { - /* Offset in rx fifo from where to peek data */ - rx_offset = transport_vft->tx_fifo_offset (tc0); - } - - /* Check how much we can pull. If buffering, subtract the offset */ - max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; - - /* Nothing to read return */ - if (max_dequeue0 == 0) - { - svm_fifo_unset_event (s0->server_tx_fifo); - return 0; - } - - /* Ensure we're not writing more than transport window allows */ - if (max_dequeue0 < snd_space0) - { - /* Constrained by tx queue. Try to send only fully formed segments */ - max_len_to_snd0 = (max_dequeue0 > snd_mss0) ? - max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0; - /* TODO Nagle ? 
*/ - } - else - { - max_len_to_snd0 = snd_space0; - } - - n_bytes_per_buf = vlib_buffer_free_list_buffer_size - (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; - n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); - n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) - * n_bufs_per_seg; - n_frames_per_evt = ceil ((double) n_bufs_per_evt / VLIB_FRAME_SIZE); - - deq_per_buf = clib_min (snd_mss0, n_bytes_per_buf); - - n_bufs = vec_len (smm->tx_buffers[thread_index]); - left_to_snd0 = max_len_to_snd0; - for (i = 0; i < n_frames_per_evt; i++) - { - /* Make sure we have at least one full frame of buffers ready */ - if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) - { - vec_validate (smm->tx_buffers[thread_index], - n_bufs + 2 * VLIB_FRAME_SIZE - 1); - - buffers_allocated = 0; - do - { - buffers_allocated_this_call = - vlib_buffer_alloc - (vm, - &smm->tx_buffers[thread_index][n_bufs + buffers_allocated], - 2 * VLIB_FRAME_SIZE - buffers_allocated); - buffers_allocated += buffers_allocated_this_call; - } - while (buffers_allocated_this_call > 0 - && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); - - n_bufs += buffers_allocated; - - _vec_len (smm->tx_buffers[thread_index]) = n_bufs; - - if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - return -1; - } - } - /* Allow enqueuing of a new event */ - svm_fifo_unset_event (s0->server_tx_fifo); - - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) - { - /* - * Handle first buffer in chain separately - */ - - /* Get free buffer */ - ASSERT (n_bufs >= 1); - bi0 = smm->tx_buffers[thread_index][--n_bufs]; - ASSERT (bi0); - _vec_len (smm->tx_buffers[thread_index]) = n_bufs; - - /* usual speculation, or the enqueue_x1 macro will barf */ - to_next[0] = bi0; - to_next += 1; - n_left_to_next -= 1; - - b0 = vlib_get_buffer (vm, bi0); 
- b0->error = 0; - b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID - | VNET_BUFFER_F_LOCALLY_ORIGINATED; - b0->current_data = 0; - b0->total_length_not_including_first_buffer = 0; - - len_to_deq0 = clib_min (left_to_snd0, deq_per_buf); - - data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); - if (peek_data) - { - n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, - len_to_deq0, data0); - /* Keep track of progress locally, transport is also supposed to - * increment it independently when pushing the header */ - rx_offset += n_bytes_read; - } - else - { - n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, - len_to_deq0, data0); - } - - if (n_bytes_read <= 0) - goto dequeue_fail; - - b0->current_length = n_bytes_read; - - left_to_snd0 -= n_bytes_read; - *n_tx_packets = *n_tx_packets + 1; - - /* - * Fill in the remaining buffers in the chain, if any - */ - if (PREDICT_FALSE (n_bufs_per_seg > 1)) - session_tx_fifo_chain_tail (smm, vm, thread_index, - s0->server_tx_fifo, b0, bi0, - n_bufs_per_seg, &left_to_snd0, - &n_bufs, &rx_offset, deq_per_buf, - peek_data); - - /* Ask transport to push header after current_length and - * total_length_not_including_first_buffer are updated */ - transport_vft->push_header (tc0, b0); - - /* *INDENT-OFF* */ - SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ - ed->data[0] = e0->event_id; - ed->data[1] = max_dequeue0; - ed->data[2] = len_to_deq0; - ed->data[3] = left_to_snd0; - })); - /* *INDENT-ON* */ - - - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - if (PREDICT_FALSE (n_trace > 0)) - { - session_queue_trace_t *t0; - vlib_trace_buffer (vm, node, next_index, b0, - 1 /* follow_chain */ ); - vlib_set_trace_count (vm, node, --n_trace); - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - t0->session_index = s0->session_index; - t0->server_thread_index = s0->thread_index; - } - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, next0); - } - vlib_put_next_frame (vm, node, next_index, 
n_left_to_next); - } - - /* If we couldn't dequeue all bytes mark as partially read */ - if (max_len_to_snd0 < max_dequeue0) - { - /* If we don't already have new event */ - if (svm_fifo_set_event (s0->server_tx_fifo)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - } - } - return 0; - -dequeue_fail: - /* - * Can't read from fifo. If we don't already have an event, save as partially - * read, return buff to free list and return - */ - clib_warning ("dequeue fail"); - - if (svm_fifo_set_event (s0->server_tx_fifo)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - } - vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); - _vec_len (smm->tx_buffers[thread_index]) += 1; - - return 0; -} - -int -session_tx_fifo_peek_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, - session_fifo_event_t * e0, - stream_session_t * s0, u32 thread_index, - int *n_tx_pkts) -{ - return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, - n_tx_pkts, 1); -} - -int -session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, - session_manager_main_t * smm, - session_fifo_event_t * e0, - stream_session_t * s0, u32 thread_index, - int *n_tx_pkts) -{ - return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, - n_tx_pkts, 0); -} - -always_inline stream_session_t * -session_event_get_session (session_fifo_event_t * e, u8 thread_index) -{ - ASSERT (e->fifo->master_thread_index == thread_index); - return stream_session_get_if_valid (e->fifo->master_session_index, - thread_index); -} - -void -dump_thread_0_event_queue (void) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - vlib_main_t *vm = &vlib_global_main; - u32 my_thread_index = vm->thread_index; - session_fifo_event_t _e, *e = &_e; - stream_session_t *s0; - int i, index; - i8 *headp; - - unix_shared_memory_queue_t *q; - q = smm->vpp_event_queues[my_thread_index]; - - index = q->head; - - for 
(i = 0; i < q->cursize; i++) - { - headp = (i8 *) (&q->data[0] + q->elsize * index); - clib_memcpy (e, headp, q->elsize); - - switch (e->event_type) - { - case FIFO_EVENT_APP_TX: - s0 = session_event_get_session (e, my_thread_index); - fformat (stdout, "[%04d] TX session %d\n", i, s0->session_index); - break; - - case FIFO_EVENT_DISCONNECT: - s0 = stream_session_get_from_handle (e->session_handle); - fformat (stdout, "[%04d] disconnect session %d\n", i, - s0->session_index); - break; - - case FIFO_EVENT_BUILTIN_RX: - s0 = session_event_get_session (e, my_thread_index); - fformat (stdout, "[%04d] builtin_rx %d\n", i, s0->session_index); - break; - - case FIFO_EVENT_RPC: - fformat (stdout, "[%04d] RPC call %llx with %llx\n", - i, (u64) (e->rpc_args.fp), (u64) (e->rpc_args.arg)); - break; - - default: - fformat (stdout, "[%04d] unhandled event type %d\n", - i, e->event_type); - break; - } - - index++; - - if (index == q->maxsize) - index = 0; - } -} - -static u8 -session_node_cmp_event (session_fifo_event_t * e, svm_fifo_t * f) -{ - stream_session_t *s; - switch (e->event_type) - { - case FIFO_EVENT_APP_RX: - case FIFO_EVENT_APP_TX: - case FIFO_EVENT_BUILTIN_RX: - if (e->fifo == f) - return 1; - break; - case FIFO_EVENT_DISCONNECT: - break; - case FIFO_EVENT_RPC: - s = stream_session_get_from_handle (e->session_handle); - if (!s) - { - clib_warning ("session has event but doesn't exist!"); - break; - } - if (s->server_rx_fifo == f || s->server_tx_fifo == f) - return 1; - break; - default: - break; - } - return 0; -} - -u8 -session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - unix_shared_memory_queue_t *q; - session_fifo_event_t *pending_event_vector, *evt; - int i, index, found = 0; - i8 *headp; - u8 thread_index; - - ASSERT (e); - thread_index = f->master_thread_index; - /* - * Search evt queue - */ - q = smm->vpp_event_queues[thread_index]; - index = q->head; - for (i = 0; i 
< q->cursize; i++) - { - headp = (i8 *) (&q->data[0] + q->elsize * index); - clib_memcpy (e, headp, q->elsize); - found = session_node_cmp_event (e, f); - if (found) - break; - if (++index == q->maxsize) - index = 0; - } - /* - * Search pending events vector - */ - pending_event_vector = smm->pending_event_vector[thread_index]; - vec_foreach (evt, pending_event_vector) - { - found = session_node_cmp_event (evt, f); - if (found) - { - clib_memcpy (e, evt, sizeof (*evt)); - break; - } - } - return found; -} - -static uword -session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - session_fifo_event_t *my_pending_event_vector, *e; - session_fifo_event_t *my_fifo_events; - u32 n_to_dequeue, n_events; - unix_shared_memory_queue_t *q; - application_t *app; - int n_tx_packets = 0; - u32 my_thread_index = vm->thread_index; - int i, rv; - f64 now = vlib_time_now (vm); - void (*fp) (void *); - - SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index); - - /* - * Update TCP time - */ - tcp_update_time (now, my_thread_index); - - /* - * Get vpp queue events - */ - q = smm->vpp_event_queues[my_thread_index]; - if (PREDICT_FALSE (q == 0)) - return 0; - - my_fifo_events = smm->free_event_vector[my_thread_index]; - - /* min number of events we can dequeue without blocking */ - n_to_dequeue = q->cursize; - my_pending_event_vector = smm->pending_event_vector[my_thread_index]; - - if (n_to_dequeue == 0 && vec_len (my_pending_event_vector) == 0) - return 0; - - SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 0); - - /* - * If we didn't manage to process previous events try going - * over them again without dequeuing new ones. 
- */ - /* XXX: Block senders to sessions that can't keep up */ - if (0 && vec_len (my_pending_event_vector) >= 100) - { - clib_warning ("too many fifo events unsolved"); - goto skip_dequeue; - } - - /* See you in the next life, don't be late */ - if (pthread_mutex_trylock (&q->mutex)) - return 0; - - for (i = 0; i < n_to_dequeue; i++) - { - vec_add2 (my_fifo_events, e, 1); - unix_shared_memory_queue_sub_raw (q, (u8 *) e); - } - - /* The other side of the connection is not polling */ - if (q->cursize < (q->maxsize / 8)) - (void) pthread_cond_broadcast (&q->condvar); - pthread_mutex_unlock (&q->mutex); - - vec_append (my_fifo_events, my_pending_event_vector); - - _vec_len (my_pending_event_vector) = 0; - smm->pending_event_vector[my_thread_index] = my_pending_event_vector; - -skip_dequeue: - n_events = vec_len (my_fifo_events); - for (i = 0; i < n_events; i++) - { - stream_session_t *s0; /* $$$ prefetch 1 ahead maybe */ - session_fifo_event_t *e0; - - e0 = &my_fifo_events[i]; - - switch (e0->event_type) - { - case FIFO_EVENT_APP_TX: - s0 = session_event_get_session (e0, my_thread_index); - - if (CLIB_DEBUG && !s0) - { - clib_warning ("It's dead, Jim!"); - continue; - } - - if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) - continue; - /* Spray packets in per session type frames, since they go to - * different nodes */ - rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, - my_thread_index, - &n_tx_packets); - /* Out of buffers */ - if (PREDICT_FALSE (rv < 0)) - { - vlib_node_increment_counter (vm, node->node_index, - SESSION_QUEUE_ERROR_NO_BUFFER, 1); - continue; - } - break; - case FIFO_EVENT_DISCONNECT: - s0 = stream_session_get_from_handle (e0->session_handle); - stream_session_disconnect (s0); - break; - case FIFO_EVENT_BUILTIN_RX: - s0 = session_event_get_session (e0, my_thread_index); - svm_fifo_unset_event (s0->server_rx_fifo); - app = application_get (s0->app_index); - app->cb_fns.builtin_server_rx_callback (s0); - break; - case 
FIFO_EVENT_RPC: - fp = e0->rpc_args.fp; - (*fp) (e0->rpc_args.arg); - break; - - default: - clib_warning ("unhandled event type %d", e0->event_type); - } - } - - _vec_len (my_fifo_events) = 0; - smm->free_event_vector[my_thread_index] = my_fifo_events; - - vlib_node_increment_counter (vm, session_queue_node.index, - SESSION_QUEUE_ERROR_TX, n_tx_packets); - - SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 1); - - return n_tx_packets; -} - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (session_queue_node) = -{ - .function = session_queue_node_fn, - .name = "session-queue", - .format_trace = format_session_queue_trace, - .type = VLIB_NODE_TYPE_INPUT, - .n_errors = ARRAY_LEN (session_queue_error_strings), - .error_strings = session_queue_error_strings, - .n_next_nodes = SESSION_QUEUE_N_NEXT, - .state = VLIB_NODE_STATE_DISABLED, - .next_nodes = - { - [SESSION_QUEUE_NEXT_DROP] = "error-drop", - [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup", - [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup", - [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output", - [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output", - }, -}; -/* *INDENT-ON* */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 09bc00e7..48000a6f 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -18,455 +18,15 @@ */ #include +#include +#include #include #include #include -#include #include -#include - -/** - * Per-type vector of transport protocol virtual function tables - */ -static transport_proto_vft_t *tp_vfts; session_manager_main_t session_manager_main; - -transport_connection_t * -stream_session_lookup_half_open (transport_connection_t * tc) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - int rv; - if (tc->is_ip4) - { - make_v4_ss_kv_from_tc (&kv4, tc); - rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); - if 
(rv == 0) - return tp_vfts[tc->proto].get_half_open (kv4.value & 0xFFFFFFFFULL); - } - return 0; -} - -/* - * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) - * Value: (owner thread index << 32 | session_index); - */ -void -stream_session_table_add_for_tc (transport_connection_t * tc, u64 value) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - session_kv6_t kv6; - - switch (tc->proto) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv_from_tc (&kv4, tc); - kv4.value = value; - clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, 1 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - make_v6_ss_kv_from_tc (&kv6, tc); - kv6.value = value; - clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, 1 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); - } -} - -void -stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, - u64 value) -{ - transport_connection_t *tc; - - tc = tp_vfts[s->session_type].get_connection (s->connection_index, - s->thread_index); - stream_session_table_add_for_tc (tc, value); -} - -static void -stream_session_half_open_table_add (session_type_t sst, - transport_connection_t * tc, u64 value) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - session_kv6_t kv6; - - switch (sst) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv_from_tc (&kv4, tc); - kv4.value = value; - clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, - 1 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - make_v6_ss_kv_from_tc (&kv6, tc); - kv6.value = value; - clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, - 1 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); - } -} - -int -stream_session_table_del_for_tc (transport_connection_t * tc) -{ - 
session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - session_kv6_t kv6; - switch (tc->proto) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv_from_tc (&kv4, tc); - return clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, - 0 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - make_v6_ss_kv_from_tc (&kv6, tc); - return clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, - 0 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); - } - - return 0; -} - -static int -stream_session_table_del (session_manager_main_t * smm, stream_session_t * s) -{ - transport_connection_t *ts; - - ts = tp_vfts[s->session_type].get_connection (s->connection_index, - s->thread_index); - return stream_session_table_del_for_tc (ts); -} - -static void -stream_session_half_open_table_del (session_manager_main_t * smm, u8 sst, - transport_connection_t * tc) -{ - session_kv4_t kv4; - session_kv6_t kv6; - - switch (sst) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv_from_tc (&kv4, tc); - clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, - 0 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - make_v6_ss_kv_from_tc (&kv6, tc); - clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, - 0 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); - } -} - -stream_session_t * -stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - int rv; - - make_v4_listener_kv (&kv4, lcl, lcl_port, proto); - rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); - if (rv == 0) - return pool_elt_at_index (smm->listen_sessions[proto], (u32) kv4.value); - - /* Zero out the lcl ip */ - kv4.key[0] = 0; - rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, 
&kv4); - if (rv == 0) - return pool_elt_at_index (smm->listen_sessions[proto], kv4.value); - - return 0; -} - -/** Looks up a session based on the 5-tuple passed as argument. - * - * First it tries to find an established session, if this fails, it tries - * finding a listener session if this fails, it tries a lookup with a - * wildcarded local source (listener bound to all interfaces) - */ -stream_session_t * -stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - stream_session_t *s; - int rv; - - /* Lookup session amongst established ones */ - make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); - if (rv == 0) - return stream_session_get_from_handle (kv4.value); - - /* If nothing is found, check if any listener is available */ - if ((s = stream_session_lookup_listener4 (lcl, lcl_port, proto))) - return s; - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); - if (rv == 0) - return stream_session_get_from_handle (kv4.value); - return 0; -} - -stream_session_t * -stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv6_t kv6; - int rv; - - make_v6_listener_kv (&kv6, lcl, lcl_port, proto); - rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); - if (rv == 0) - return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); - - /* Zero out the lcl ip */ - kv6.key[0] = kv6.key[1] = 0; - rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); - if (rv == 0) - return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); - - return 0; -} - -/* Looks up a session based on the 5-tuple passed as argument. 
- * First it tries to find an established session, if this fails, it tries - * finding a listener session if this fails, it tries a lookup with a - * wildcarded local source (listener bound to all interfaces) */ -stream_session_t * -stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - session_kv6_t kv6; - stream_session_t *s; - int rv; - - make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); - if (rv == 0) - return stream_session_get_from_handle (kv6.value); - - /* If nothing is found, check if any listener is available */ - if ((s = stream_session_lookup_listener6 (lcl, lcl_port, proto))) - return s; - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); - if (rv == 0) - return stream_session_get_from_handle (kv6.value); - return 0; -} - -stream_session_t * -stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto) -{ - switch (proto) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - return stream_session_lookup_listener4 (&lcl->ip4, lcl_port, proto); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - return stream_session_lookup_listener6 (&lcl->ip6, lcl_port, proto); - break; - } - return 0; -} - -static u64 -stream_session_half_open_lookup (session_manager_main_t * smm, - ip46_address_t * lcl, ip46_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - session_kv4_t kv4; - session_kv6_t kv6; - int rv; - - switch (proto) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv (&kv4, &lcl->ip4, &rmt->ip4, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); - - if (rv == 0) - return kv4.value; - - return (u64) ~ 0; - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - 
make_v6_ss_kv (&kv6, &lcl->ip6, &rmt->ip6, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); - - if (rv == 0) - return kv6.value; - - return (u64) ~ 0; - break; - } - return 0; -} - -transport_connection_t * -stream_session_lookup_transport_wt4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - stream_session_t *s; - int rv; - - /* Lookup session amongst established ones */ - make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); - if (rv == 0) - { - s = stream_session_get_tsi (kv4.value, my_thread_index); - - return tp_vfts[s->session_type].get_connection (s->connection_index, - my_thread_index); - } - - /* If nothing is found, check if any listener is available */ - s = stream_session_lookup_listener4 (lcl, lcl_port, proto); - if (s) - return tp_vfts[s->session_type].get_listener (s->connection_index); - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); - if (rv == 0) - return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); - return 0; -} - -transport_connection_t * -stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - session_manager_main_t *smm = &session_manager_main; - session_kv4_t kv4; - stream_session_t *s; - int rv; - - /* Lookup session amongst established ones */ - make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); - if (rv == 0) - { - s = stream_session_get_from_handle (kv4.value); - return tp_vfts[s->session_type].get_connection (s->connection_index, - s->thread_index); - } - - /* If nothing is found, check if any listener is available */ - s = stream_session_lookup_listener4 (lcl, 
lcl_port, proto); - if (s) - return tp_vfts[s->session_type].get_listener (s->connection_index); - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); - if (rv == 0) - return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); - return 0; -} - -transport_connection_t * -stream_session_lookup_transport_wt6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto, - u32 my_thread_index) -{ - session_manager_main_t *smm = &session_manager_main; - stream_session_t *s; - session_kv6_t kv6; - int rv; - - make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); - if (rv == 0) - { - s = stream_session_get_tsi (kv6.value, my_thread_index); - - return tp_vfts[s->session_type].get_connection (s->connection_index, - my_thread_index); - } - - /* If nothing is found, check if any listener is available */ - s = stream_session_lookup_listener6 (lcl, lcl_port, proto); - if (s) - return tp_vfts[s->session_type].get_listener (s->connection_index); - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); - if (rv == 0) - return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); - - return 0; -} - -transport_connection_t * -stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - session_manager_main_t *smm = &session_manager_main; - stream_session_t *s; - session_kv6_t kv6; - int rv; - - make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); - rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); - if (rv == 0) - { - s = stream_session_get_from_handle (kv6.value); - return tp_vfts[s->session_type].get_connection (s->connection_index, - s->thread_index); - } - - /* If nothing is found, check if any listener is available */ - s = stream_session_lookup_listener6 (lcl, lcl_port, proto); - 
if (s) - return tp_vfts[s->session_type].get_listener (s->connection_index); - - /* Finally, try half-open connections */ - rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); - if (rv == 0) - return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); - - return 0; -} +extern transport_proto_vft_t *tp_vfts; int stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, @@ -797,16 +357,15 @@ int stream_session_connect_notify (transport_connection_t * tc, u8 sst, u8 is_fail) { - session_manager_main_t *smm = &session_manager_main; application_t *app; stream_session_t *new_s = 0; u64 handle; u32 api_context = 0; int error = 0; - handle = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, - tc->lcl_port, tc->rmt_port, - tc->proto); + handle = stream_session_half_open_lookup_handle (&tc->lcl_ip, &tc->rmt_ip, + tc->lcl_port, tc->rmt_port, + tc->proto); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); @@ -847,7 +406,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, } /* Cleanup session lookup */ - stream_session_half_open_table_del (smm, sst, tc); + stream_session_half_open_table_del (sst, tc); return error; } @@ -891,7 +450,7 @@ stream_session_delete (stream_session_t * s) int rv; /* Delete from the main lookup table. 
*/ - if ((rv = stream_session_table_del (smm, s))) + if ((rv = stream_session_table_del (s))) clib_warning ("hash delete error, rv %d", rv); /* Cleanup fifo segments */ @@ -986,14 +545,14 @@ stream_session_accept (transport_connection_t * tc, u32 listener_index, */ int stream_session_open (u32 app_index, session_type_t st, - transport_endpoint_t * tep, + transport_endpoint_t * rmt, transport_connection_t ** res) { transport_connection_t *tc; int rv; u64 handle; - rv = tp_vfts[st].open (&tep->ip, tep->port); + rv = tp_vfts[st].open (rmt); if (rv < 0) { clib_warning ("Transport failed to open connection."); @@ -1030,7 +589,7 @@ stream_session_listen (stream_session_t * s, transport_endpoint_t * tep) u32 tci; /* Transport bind/listen */ - tci = tp_vfts[s->session_type].bind (s->session_index, &tep->ip, tep->port); + tci = tp_vfts[s->session_type].bind (s->session_index, tep); if (tci == (u32) ~ 0) return -1; @@ -1132,41 +691,18 @@ stream_session_disconnect (stream_session_t * s) void stream_session_cleanup (stream_session_t * s) { - session_manager_main_t *smm = &session_manager_main; int rv; s->session_state = SESSION_STATE_CLOSED; /* Delete from the main lookup table to avoid more enqueues */ - rv = stream_session_table_del (smm, s); + rv = stream_session_table_del (s); if (rv) clib_warning ("hash delete error, rv %d", rv); tp_vfts[s->session_type].cleanup (s->connection_index, s->thread_index); } -void -session_register_transport (u8 type, const transport_proto_vft_t * vft) -{ - session_manager_main_t *smm = vnet_get_session_manager_main (); - - vec_validate (tp_vfts, type); - tp_vfts[type] = *vft; - - /* If an offset function is provided, then peek instead of dequeue */ - smm->session_tx_fns[type] = - (vft->tx_fifo_offset) ? 
session_tx_fifo_peek_and_snd : - session_tx_fifo_dequeue_and_snd; -} - -transport_proto_vft_t * -session_get_transport_vft (u8 type) -{ - if (type >= vec_len (tp_vfts)) - return 0; - return &tp_vfts[type]; -} - /** * Allocate vpp event queue (once) per worker thread */ @@ -1269,19 +805,7 @@ session_manager_main_enable (vlib_main_t * vm) for (i = 0; i < smm->preallocated_sessions; i++) pool_put_index (smm->sessions[0], i); - clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - clib_bihash_init_48_8 (&smm->v6_session_hash, "v6 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - - clib_bihash_init_16_8 (&smm->v4_half_open_hash, "v4 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - clib_bihash_init_48_8 (&smm->v6_half_open_hash, "v6 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + session_lookup_init (); smm->is_enabled = 1; @@ -1328,11 +852,7 @@ clib_error_t * session_manager_main_init (vlib_main_t * vm) { session_manager_main_t *smm = &session_manager_main; - - smm->vlib_main = vm; - smm->vnet_main = vnet_get_main (); smm->is_enabled = 0; - return 0; } diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 6c616326..bb22f100 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -15,7 +15,9 @@ #ifndef __included_session_h__ #define __included_session_h__ -#include +#include +#include +#include #include #include #include @@ -66,37 +68,6 @@ typedef enum SESSION_QUEUE_N_NEXT, } session_queue_next_t; -#define foreach_session_type \ - _(IP4_TCP, ip4_tcp) \ - _(IP4_UDP, ip4_udp) \ - _(IP6_TCP, ip6_tcp) \ - _(IP6_UDP, ip6_udp) - -typedef enum -{ -#define _(A, a) SESSION_TYPE_##A, - foreach_session_type -#undef _ - 
SESSION_N_TYPES, -} session_type_t; - - -session_type_t -session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4); - -/* - * Application session state - */ -typedef enum -{ - SESSION_STATE_LISTENING, - SESSION_STATE_CONNECTING, - SESSION_STATE_ACCEPTING, - SESSION_STATE_READY, - SESSION_STATE_CLOSED, - SESSION_STATE_N_STATES, -} stream_session_state_t; - typedef struct { void *fp; @@ -116,48 +87,6 @@ typedef CLIB_PACKED (struct { }) session_fifo_event_t; /* *INDENT-ON* */ -typedef struct _stream_session_t -{ - /** fifo pointers. Once allocated, these do not move */ - svm_fifo_t *server_rx_fifo; - svm_fifo_t *server_tx_fifo; - - /** Type */ - u8 session_type; - - /** State */ - u8 session_state; - - u8 thread_index; - - /** To avoid n**2 "one event per frame" check */ - u8 enqueue_epoch; - - /** Pad to a multiple of 8 octets */ - u8 align_pad[4]; - - /** svm segment index where fifos were allocated */ - u32 svm_segment_index; - - /** Session index in per_thread pool */ - u32 session_index; - - /** Transport specific */ - u32 connection_index; - - /** Application specific */ - u32 pid; - - /** stream server pool index */ - u32 app_index; - - /** Parent listener session if the result of an accept */ - u32 listener_index; - - /** Opaque, pad to a 64-octet boundary */ - u64 opaque[2]; -} stream_session_t; - /* Forward definition */ typedef struct _session_manager_main session_manager_main_t; @@ -174,14 +103,6 @@ u8 session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e); struct _session_manager_main { - /** Lookup tables for established sessions and listeners */ - clib_bihash_16_8_t v4_session_hash; - clib_bihash_48_8_t v6_session_hash; - - /** Lookup tables for half-open sessions */ - clib_bihash_16_8_t v4_half_open_hash; - clib_bihash_48_8_t v6_half_open_hash; - /** Per worker thread session pools */ stream_session_t **sessions; @@ -224,10 +145,6 @@ struct _session_manager_main /** Preallocate session config parameter */ u32 
preallocated_sessions; - /* Convenience */ - vlib_main_t *vlib_main; - vnet_main_t *vnet_main; - #if SESSION_DBG /** * last event poll time by thread @@ -250,60 +167,6 @@ vnet_get_session_manager_main () return &session_manager_main; } -/* - * Stream session functions - */ - -stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, - u16 lcl_port, u8 proto); -stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, - ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto); -stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, - u16 lcl_port, u8 proto); -stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, - ip6_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto); -transport_connection_t - * stream_session_lookup_transport_wt4 (ip4_address_t * lcl, - ip4_address_t * rmt, u16 lcl_port, - u16 rmt_port, u8 proto, - u32 thread_index); -transport_connection_t *stream_session_lookup_transport4 (ip4_address_t * lcl, - ip4_address_t * rmt, - u16 lcl_port, - u16 rmt_port, - u8 proto); -transport_connection_t *stream_session_lookup_transport_wt6 (ip6_address_t * - lcl, - ip6_address_t * - rmt, - u16 lcl_port, - u16 rmt_port, - u8 proto, - u32 - thread_index); -transport_connection_t *stream_session_lookup_transport6 (ip6_address_t * lcl, - ip6_address_t * rmt, - u16 lcl_port, - u16 rmt_port, - u8 proto); - -stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, - u16 lcl_port, u8 proto); -transport_connection_t - * stream_session_lookup_half_open (transport_connection_t * tc); -void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); -int stream_session_table_del_for_tc (transport_connection_t * tc); - -always_inline stream_session_t * -stream_session_get_tsi (u64 ti_and_si, u32 thread_index) -{ - ASSERT ((u32) (ti_and_si >> 32) == thread_index); - return pool_elt_at_index (session_manager_main.sessions[thread_index], - ti_and_si & 0xFFFFFFFFULL); -} - always_inline u8 
stream_session_is_valid (u32 si, u8 thread_index) { @@ -445,9 +308,6 @@ send_session_connected_callback (u32 app_index, u32 api_context, stream_session_t * s, u8 is_fail); -void session_register_transport (u8 type, const transport_proto_vft_t * vft); -transport_proto_vft_t *session_get_transport_vft (u8 type); - clib_error_t *vnet_session_enable_disable (vlib_main_t * vm, u8 is_en); always_inline unix_shared_memory_queue_t * @@ -510,6 +370,24 @@ listen_session_del (stream_session_t * s) pool_put (session_manager_main.listen_sessions[s->session_type], s); } +always_inline stream_session_t * +session_manager_get_listener (u8 type, u32 index) +{ + return pool_elt_at_index (session_manager_main.listen_sessions[type], + index); +} + +always_inline void +session_manager_set_transport_rx_fn (u8 type, u8 is_peek) +{ + /* If an offset function is provided, then peek instead of dequeue */ + session_manager_main.session_tx_fns[type] = (is_peek) ? + session_tx_fifo_peek_and_snd : session_tx_fifo_dequeue_and_snd; +} + +session_type_t +session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4); + always_inline u8 session_manager_is_enabled () { diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c new file mode 100644 index 00000000..b3862ee3 --- /dev/null +++ b/src/vnet/session/session_lookup.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Generate typed init functions for multiple hash table styles... */ +#include +#include + +#include + +#undef __included_bihash_template_h__ + +#include +#include + +#include +#include +#include + +static session_lookup_t session_lookup; +extern transport_proto_vft_t *tp_vfts; + +/* *INDENT-OFF* */ +/* 16 octets */ +typedef CLIB_PACKED (struct { + union + { + struct + { + ip4_address_t src; + ip4_address_t dst; + u16 src_port; + u16 dst_port; + /* align by making this 4 octets even though its a 1-bit field + * NOTE: avoid key overlap with other transports that use 5 tuples for + * session identification. + */ + u32 proto; + }; + u64 as_u64[2]; + }; +}) v4_connection_key_t; + +typedef CLIB_PACKED (struct { + union + { + struct + { + /* 48 octets */ + ip6_address_t src; + ip6_address_t dst; + u16 src_port; + u16 dst_port; + u32 proto; + u64 unused; + }; + u64 as_u64[6]; + }; +}) v6_connection_key_t; +/* *INDENT-ON* */ + +typedef clib_bihash_kv_16_8_t session_kv4_t; +typedef clib_bihash_kv_48_8_t session_kv6_t; + +always_inline void +make_v4_ss_kv (session_kv4_t * kv, ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v4_connection_key_t *key = (v4_connection_key_t *) kv->key; + + key->src.as_u32 = lcl->as_u32; + key->dst.as_u32 = rmt->as_u32; + key->src_port = lcl_port; + key->dst_port = rmt_port; + key->proto = proto; + + kv->value = ~0ULL; +} + +always_inline void +make_v4_listener_kv (session_kv4_t * kv, ip4_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v4_connection_key_t *key = (v4_connection_key_t *) kv->key; + + key->src.as_u32 = lcl->as_u32; + key->dst.as_u32 = 0; + key->src_port = lcl_port; + key->dst_port = 0; + key->proto = proto; + + kv->value = ~0ULL; +} + +always_inline void +make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) +{ + return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, + t->rmt_port, t->proto); +} + +always_inline void +make_v6_ss_kv (session_kv6_t * kv, 
ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v6_connection_key_t *key = (v6_connection_key_t *) kv->key; + + key->src.as_u64[0] = lcl->as_u64[0]; + key->src.as_u64[1] = lcl->as_u64[1]; + key->dst.as_u64[0] = rmt->as_u64[0]; + key->dst.as_u64[1] = rmt->as_u64[1]; + key->src_port = lcl_port; + key->dst_port = rmt_port; + key->proto = proto; + key->unused = 0; + + kv->value = ~0ULL; +} + +always_inline void +make_v6_listener_kv (session_kv6_t * kv, ip6_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v6_connection_key_t *key = (v6_connection_key_t *) kv->key; + + key->src.as_u64[0] = lcl->as_u64[0]; + key->src.as_u64[1] = lcl->as_u64[1]; + key->dst.as_u64[0] = 0; + key->dst.as_u64[1] = 0; + key->src_port = lcl_port; + key->dst_port = 0; + key->proto = proto; + key->unused = 0; + + kv->value = ~0ULL; +} + +always_inline void +make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) +{ + make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, + t->rmt_port, t->proto); +} + +/* + * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) + * Value: (owner thread index << 32 | session_index); + */ +void +stream_session_table_add_for_tc (transport_connection_t * tc, u64 value) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (tc->proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&sl->v4_session_hash, &kv4, 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&sl->v6_session_hash, &kv6, 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +void +stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, + u64 value) +{ + transport_connection_t *tc; + + tc = 
tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + stream_session_table_add_for_tc (tc, value); +} + +void +stream_session_half_open_table_add (session_type_t sst, + transport_connection_t * tc, u64 value) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, + 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, + 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +int +stream_session_table_del_for_tc (transport_connection_t * tc) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + switch (tc->proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + return clib_bihash_add_del_16_8 (&sl->v4_session_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + return clib_bihash_add_del_48_8 (&sl->v6_session_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } + + return 0; +} + +int +stream_session_table_del (stream_session_t * s) +{ + transport_connection_t *ts; + ts = tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + return stream_session_table_del_for_tc (ts); +} + +void +stream_session_half_open_table_del (u8 sst, transport_connection_t * tc) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + 
clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +stream_session_t * +stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + int rv; + + make_v4_listener_kv (&kv4, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_16_8 (&sl->v4_session_hash, &kv4); + if (rv == 0) + return session_manager_get_listener (proto, (u32) kv4.value); + + /* Zero out the lcl ip */ + kv4.key[0] = 0; + rv = clib_bihash_search_inline_16_8 (&sl->v4_session_hash, &kv4); + if (rv == 0) + return session_manager_get_listener (proto, (u32) kv4.value); + + return 0; +} + +/** Looks up a session based on the 5-tuple passed as argument. 
+ * + * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) + */ +stream_session_t * +stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&sl->v4_session_hash, &kv4); + if (rv == 0) + return stream_session_get_from_handle (kv4.value); + + /* If nothing is found, check if any listener is available */ + if ((s = stream_session_lookup_listener4 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&sl->v4_half_open_hash, &kv4); + if (rv == 0) + return stream_session_get_from_handle (kv4.value); + return 0; +} + +stream_session_t * +stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv6_t kv6; + int rv; + + make_v6_listener_kv (&kv6, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_48_8 (&sl->v6_session_hash, &kv6); + if (rv == 0) + return session_manager_get_listener (proto, (u32) kv6.value); + + /* Zero out the lcl ip */ + kv6.key[0] = kv6.key[1] = 0; + rv = clib_bihash_search_inline_48_8 (&sl->v6_session_hash, &kv6); + if (rv == 0) + return session_manager_get_listener (proto, (u32) kv6.value); + + return 0; +} + +/* Looks up a session based on the 5-tuple passed as argument. 
+ * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) */ +stream_session_t * +stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv6_t kv6; + stream_session_t *s; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&sl->v6_session_hash, &kv6); + if (rv == 0) + return stream_session_get_from_handle (kv6.value); + + /* If nothing is found, check if any listener is available */ + if ((s = stream_session_lookup_listener6 (lcl, lcl_port, proto))) + return s; + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&sl->v6_half_open_hash, &kv6); + if (rv == 0) + return stream_session_get_from_handle (kv6.value); + return 0; +} + +stream_session_t * +stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto) +{ + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + return stream_session_lookup_listener4 (&lcl->ip4, lcl_port, proto); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + return stream_session_lookup_listener6 (&lcl->ip6, lcl_port, proto); + break; + } + return 0; +} + +u64 +stream_session_half_open_lookup_handle (ip46_address_t * lcl, + ip46_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + int rv; + + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv (&kv4, &lcl->ip4, &rmt->ip4, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&sl->v4_half_open_hash, &kv4); + + if (rv == 0) + return kv4.value; + + return HALF_OPEN_LOOKUP_INVALID_VALUE; + break; + case SESSION_TYPE_IP6_UDP: + case 
SESSION_TYPE_IP6_TCP: + make_v6_ss_kv (&kv6, &lcl->ip6, &rmt->ip6, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&sl->v6_half_open_hash, &kv6); + + if (rv == 0) + return kv6.value; + + return HALF_OPEN_LOOKUP_INVALID_VALUE; + break; + } + return HALF_OPEN_LOOKUP_INVALID_VALUE; +} + +transport_connection_t * +stream_session_half_open_lookup (ip46_address_t * lcl, ip46_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + u64 handle; + handle = + stream_session_half_open_lookup_handle (lcl, rmt, lcl_port, rmt_port, + proto); + if (handle != HALF_OPEN_LOOKUP_INVALID_VALUE) + return tp_vfts[proto].get_half_open (handle & 0xFFFFFFFF); + return 0; +} + +always_inline stream_session_t * +stream_session_get_tsi (u64 ti_and_si, u32 thread_index) +{ + ASSERT ((u32) (ti_and_si >> 32) == thread_index); + return pool_elt_at_index (session_manager_main.sessions[thread_index], + ti_and_si & 0xFFFFFFFFULL); +} + +transport_connection_t * +stream_session_lookup_transport_wt4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&sl->v4_session_hash, &kv4); + if (rv == 0) + { + s = stream_session_get_tsi (kv4.value, my_thread_index); + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener4 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&sl->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); + return 0; +} + 
+transport_connection_t * +stream_session_lookup_transport4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&sl->v4_session_hash, &kv4); + if (rv == 0) + { + s = stream_session_get_from_handle (kv4.value); + return tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener4 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&sl->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); + return 0; +} + +transport_connection_t * +stream_session_lookup_transport_wt6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_lookup_t *sl = &session_lookup; + stream_session_t *s; + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&sl->v6_session_hash, &kv6); + if (rv == 0) + { + s = stream_session_get_tsi (kv6.value, my_thread_index); + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener6 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&sl->v6_half_open_hash, &kv6); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); + + return 0; +} + 
+transport_connection_t * +stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_lookup_t *sl = &session_lookup; + stream_session_t *s; + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&sl->v6_session_hash, &kv6); + if (rv == 0) + { + s = stream_session_get_from_handle (kv6.value); + return tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener6 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&sl->v6_half_open_hash, &kv6); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv6.value & 0xFFFFFFFF); + + return 0; +} + +void +session_lookup_init (void) +{ + session_lookup_t *sl = &session_lookup; + clib_bihash_init_16_8 (&sl->v4_session_hash, "v4 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&sl->v6_session_hash, "v6 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + clib_bihash_init_16_8 (&sl->v4_half_open_hash, "v4 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&sl->v6_half_open_hash, "v6 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_lookup.h b/src/vnet/session/session_lookup.h new file mode 100644 index 00000000..9e92dab1 --- /dev/null +++ 
b/src/vnet/session/session_lookup.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_SESSION_SESSION_LOOKUP_H_ +#define SRC_VNET_SESSION_SESSION_LOOKUP_H_ + +#include +#include + +typedef struct _session_lookup +{ + /** Lookup tables for established sessions and listeners */ + clib_bihash_16_8_t v4_session_hash; + clib_bihash_48_8_t v6_session_hash; + + /** Lookup tables for half-open sessions */ + clib_bihash_16_8_t v4_half_open_hash; + clib_bihash_48_8_t v6_half_open_hash; +} session_lookup_t; + +stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto); +stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, + ip6_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto); +transport_connection_t *stream_session_lookup_transport_wt4 (ip4_address_t * + lcl, + ip4_address_t * + rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto, + u32 + thread_index); +transport_connection_t *stream_session_lookup_transport4 (ip4_address_t * lcl, + ip4_address_t * rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto); +transport_connection_t *stream_session_lookup_transport_wt6 (ip6_address_t * + lcl, + ip6_address_t * 
+ rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto, + u32 + thread_index); +transport_connection_t *stream_session_lookup_transport6 (ip6_address_t * lcl, + ip6_address_t * rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto); + +stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, + u16 lcl_port, u8 proto); +u64 stream_session_half_open_lookup_handle (ip46_address_t * lcl, + ip46_address_t * rmt, + u16 lcl_port, + u16 rmt_port, u8 proto); +transport_connection_t *stream_session_half_open_lookup (ip46_address_t * lcl, + ip46_address_t * rmt, + u16 lcl_port, + u16 rmt_port, + u8 proto); +void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); +int stream_session_table_del_for_tc (transport_connection_t * tc); +int stream_session_table_del (stream_session_t * s); +void stream_session_half_open_table_del (u8 sst, transport_connection_t * tc); +void stream_session_half_open_table_add (session_type_t sst, + transport_connection_t * tc, + u64 value); + +void session_lookup_init (void); + +#endif /* SRC_VNET_SESSION_SESSION_LOOKUP_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c new file mode 100644 index 00000000..8d703b0b --- /dev/null +++ b/src/vnet/session/session_node.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +vlib_node_registration_t session_queue_node; + +typedef struct +{ + u32 session_index; + u32 server_thread_index; +} session_queue_trace_t; + +/* packet trace format function */ +static u8 * +format_session_queue_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + session_queue_trace_t *t = va_arg (*args, session_queue_trace_t *); + + s = format (s, "SESSION_QUEUE: session index %d, server thread index %d", + t->session_index, t->server_thread_index); + return s; +} + +vlib_node_registration_t session_queue_node; + +#define foreach_session_queue_error \ +_(TX, "Packets transmitted") \ +_(TIMER, "Timer events") \ +_(NO_BUFFER, "Out of buffers") + +typedef enum +{ +#define _(sym,str) SESSION_QUEUE_ERROR_##sym, + foreach_session_queue_error +#undef _ + SESSION_QUEUE_N_ERROR, +} session_queue_error_t; + +static char *session_queue_error_strings[] = { +#define _(sym,string) string, + foreach_session_queue_error +#undef _ +}; + +static u32 session_type_to_next[] = { + SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, + SESSION_QUEUE_NEXT_IP4_LOOKUP, + SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, + SESSION_QUEUE_NEXT_IP6_LOOKUP, +}; + +always_inline void +session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, + u8 thread_index, svm_fifo_t * fifo, + vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, + u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, + u16 deq_per_buf, u8 peek_data) +{ + vlib_buffer_t *chain_b0, *prev_b0; + u32 chain_bi0; + u16 len_to_deq0, n_bytes_read; + u8 *data0, j; + + chain_bi0 = bi0; + chain_b0 = b0; + for (j = 1; j < n_bufs_per_seg; j++) + { + prev_b0 = chain_b0; + len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); + + *n_bufs -= 1; + 
chain_bi0 = smm->tx_buffers[thread_index][*n_bufs]; + _vec_len (smm->tx_buffers[thread_index]) = *n_bufs; + + chain_b0 = vlib_get_buffer (vm, chain_bi0); + chain_b0->current_data = 0; + data0 = vlib_buffer_get_current (chain_b0); + if (peek_data) + { + n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0); + *rx_offset += n_bytes_read; + } + else + { + n_bytes_read = svm_fifo_dequeue_nowait (fifo, len_to_deq0, data0); + } + ASSERT (n_bytes_read == len_to_deq0); + chain_b0->current_length = n_bytes_read; + b0->total_length_not_including_first_buffer += chain_b0->current_length; + + /* update previous buffer */ + prev_b0->next_buffer = chain_bi0; + prev_b0->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + chain_b0->next_buffer = 0; + + *left_to_snd0 -= n_bytes_read; + if (*left_to_snd0 == 0) + break; + } +} + +always_inline int +session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_packets, u8 peek_data) +{ + u32 n_trace = vlib_get_trace_count (vm, node); + u32 left_to_snd0, max_len_to_snd0, len_to_deq0, snd_space0; + u32 n_bufs_per_evt, n_frames_per_evt; + transport_connection_t *tc0; + transport_proto_vft_t *transport_vft; + u32 next_index, next0, *to_next, n_left_to_next, bi0; + vlib_buffer_t *b0; + u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg; + u16 snd_mss0, n_bufs_per_seg, n_bufs; + u8 *data0; + int i, n_bytes_read; + u32 n_bytes_per_buf, deq_per_buf; + u32 buffers_allocated, buffers_allocated_this_call; + + next_index = next0 = session_type_to_next[s0->session_type]; + + transport_vft = session_get_transport_vft (s0->session_type); + tc0 = transport_vft->get_connection (s0->connection_index, thread_index); + + /* Make sure we have space to send and there's something to dequeue */ + snd_mss0 = transport_vft->send_mss (tc0); + snd_space0 = transport_vft->send_space (tc0); + + /* Can't 
make any progress */ + if (snd_space0 == 0 || snd_mss0 == 0) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + return 0; + } + + if (peek_data) + { + /* Offset in rx fifo from where to peek data */ + rx_offset = transport_vft->tx_fifo_offset (tc0); + } + + /* Check how much we can pull. If buffering, subtract the offset */ + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; + + /* Nothing to read return */ + if (max_dequeue0 == 0) + { + svm_fifo_unset_event (s0->server_tx_fifo); + return 0; + } + + /* Ensure we're not writing more than transport window allows */ + if (max_dequeue0 < snd_space0) + { + /* Constrained by tx queue. Try to send only fully formed segments */ + max_len_to_snd0 = (max_dequeue0 > snd_mss0) ? + max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0; + /* TODO Nagle ? */ + } + else + { + max_len_to_snd0 = snd_space0; + } + + n_bytes_per_buf = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; + n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); + n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) + * n_bufs_per_seg; + n_frames_per_evt = ceil ((double) n_bufs_per_evt / VLIB_FRAME_SIZE); + + deq_per_buf = clib_min (snd_mss0, n_bytes_per_buf); + + n_bufs = vec_len (smm->tx_buffers[thread_index]); + left_to_snd0 = max_len_to_snd0; + for (i = 0; i < n_frames_per_evt; i++) + { + /* Make sure we have at least one full frame of buffers ready */ + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_validate (smm->tx_buffers[thread_index], + n_bufs + 2 * VLIB_FRAME_SIZE - 1); + + buffers_allocated = 0; + do + { + buffers_allocated_this_call = + vlib_buffer_alloc + (vm, + &smm->tx_buffers[thread_index][n_bufs + buffers_allocated], + 2 * VLIB_FRAME_SIZE - buffers_allocated); + buffers_allocated += buffers_allocated_this_call; + } + while (buffers_allocated_this_call > 0 + && ((buffers_allocated + n_bufs < 
VLIB_FRAME_SIZE))); + + n_bufs += buffers_allocated; + + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + return -1; + } + } + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) + { + /* + * Handle first buffer in chain separately + */ + + /* Get free buffer */ + ASSERT (n_bufs >= 1); + bi0 = smm->tx_buffers[thread_index][--n_bufs]; + ASSERT (bi0); + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + /* usual speculation, or the enqueue_x1 macro will barf */ + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID + | VNET_BUFFER_F_LOCALLY_ORIGINATED; + b0->current_data = 0; + b0->total_length_not_including_first_buffer = 0; + + len_to_deq0 = clib_min (left_to_snd0, deq_per_buf); + + data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); + if (peek_data) + { + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, + len_to_deq0, data0); + /* Keep track of progress locally, transport is also supposed to + * increment it independently when pushing the header */ + rx_offset += n_bytes_read; + } + else + { + n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, + len_to_deq0, data0); + } + + if (n_bytes_read <= 0) + goto dequeue_fail; + + b0->current_length = n_bytes_read; + + left_to_snd0 -= n_bytes_read; + *n_tx_packets = *n_tx_packets + 1; + + /* + * Fill in the remaining buffers in the chain, if any + */ + if (PREDICT_FALSE (n_bufs_per_seg > 1)) + session_tx_fifo_chain_tail (smm, vm, thread_index, + s0->server_tx_fifo, b0, bi0, + n_bufs_per_seg, &left_to_snd0, + &n_bufs, &rx_offset, deq_per_buf, + peek_data); + + /* Ask transport to push header after current_length and + * 
total_length_not_including_first_buffer are updated */ + transport_vft->push_header (tc0, b0); + + /* *INDENT-OFF* */ + SESSION_EVT_DBG(SESSION_EVT_DEQ, s0, ({ + ed->data[0] = e0->event_id; + ed->data[1] = max_dequeue0; + ed->data[2] = len_to_deq0; + ed->data[3] = left_to_snd0; + })); + /* *INDENT-ON* */ + + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (PREDICT_FALSE (n_trace > 0)) + { + session_queue_trace_t *t0; + vlib_trace_buffer (vm, node, next_index, b0, + 1 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + t0->session_index = s0->session_index; + t0->server_thread_index = s0->thread_index; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* If we couldn't dequeue all bytes mark as partially read */ + if (max_len_to_snd0 < max_dequeue0) + { + /* If we don't already have new event */ + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + } + } + return 0; + +dequeue_fail: + /* + * Can't read from fifo. 
If we don't already have an event, save as partially + * read, return buff to free list and return + */ + clib_warning ("dequeue fail"); + + if (svm_fifo_set_event (s0->server_tx_fifo)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); + _vec_len (smm->tx_buffers[thread_index]) += 1; + + return 0; +} + +int +session_tx_fifo_peek_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_pkts) +{ + return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, + n_tx_pkts, 1); +} + +int +session_tx_fifo_dequeue_and_snd (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, + int *n_tx_pkts) +{ + return session_tx_fifo_read_and_snd_i (vm, node, smm, e0, s0, thread_index, + n_tx_pkts, 0); +} + +always_inline stream_session_t * +session_event_get_session (session_fifo_event_t * e, u8 thread_index) +{ + ASSERT (e->fifo->master_thread_index == thread_index); + return stream_session_get_if_valid (e->fifo->master_session_index, + thread_index); +} + +void +dump_thread_0_event_queue (void) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + vlib_main_t *vm = &vlib_global_main; + u32 my_thread_index = vm->thread_index; + session_fifo_event_t _e, *e = &_e; + stream_session_t *s0; + int i, index; + i8 *headp; + + unix_shared_memory_queue_t *q; + q = smm->vpp_event_queues[my_thread_index]; + + index = q->head; + + for (i = 0; i < q->cursize; i++) + { + headp = (i8 *) (&q->data[0] + q->elsize * index); + clib_memcpy (e, headp, q->elsize); + + switch (e->event_type) + { + case FIFO_EVENT_APP_TX: + s0 = session_event_get_session (e, my_thread_index); + fformat (stdout, "[%04d] TX session %d\n", i, s0->session_index); + break; + + case FIFO_EVENT_DISCONNECT: + s0 = 
stream_session_get_from_handle (e->session_handle); + fformat (stdout, "[%04d] disconnect session %d\n", i, + s0->session_index); + break; + + case FIFO_EVENT_BUILTIN_RX: + s0 = session_event_get_session (e, my_thread_index); + fformat (stdout, "[%04d] builtin_rx %d\n", i, s0->session_index); + break; + + case FIFO_EVENT_RPC: + fformat (stdout, "[%04d] RPC call %llx with %llx\n", + i, (u64) (e->rpc_args.fp), (u64) (e->rpc_args.arg)); + break; + + default: + fformat (stdout, "[%04d] unhandled event type %d\n", + i, e->event_type); + break; + } + + index++; + + if (index == q->maxsize) + index = 0; + } +} + +static u8 +session_node_cmp_event (session_fifo_event_t * e, svm_fifo_t * f) +{ + stream_session_t *s; + switch (e->event_type) + { + case FIFO_EVENT_APP_RX: + case FIFO_EVENT_APP_TX: + case FIFO_EVENT_BUILTIN_RX: + if (e->fifo == f) + return 1; + break; + case FIFO_EVENT_DISCONNECT: + break; + case FIFO_EVENT_RPC: + s = stream_session_get_from_handle (e->session_handle); + if (!s) + { + clib_warning ("session has event but doesn't exist!"); + break; + } + if (s->server_rx_fifo == f || s->server_tx_fifo == f) + return 1; + break; + default: + break; + } + return 0; +} + +u8 +session_node_lookup_fifo_event (svm_fifo_t * f, session_fifo_event_t * e) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + unix_shared_memory_queue_t *q; + session_fifo_event_t *pending_event_vector, *evt; + int i, index, found = 0; + i8 *headp; + u8 thread_index; + + ASSERT (e); + thread_index = f->master_thread_index; + /* + * Search evt queue + */ + q = smm->vpp_event_queues[thread_index]; + index = q->head; + for (i = 0; i < q->cursize; i++) + { + headp = (i8 *) (&q->data[0] + q->elsize * index); + clib_memcpy (e, headp, q->elsize); + found = session_node_cmp_event (e, f); + if (found) + break; + if (++index == q->maxsize) + index = 0; + } + /* + * Search pending events vector + */ + pending_event_vector = smm->pending_event_vector[thread_index]; + vec_foreach (evt, 
pending_event_vector) + { + found = session_node_cmp_event (evt, f); + if (found) + { + clib_memcpy (e, evt, sizeof (*evt)); + break; + } + } + return found; +} + +static uword +session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_fifo_event_t *my_pending_event_vector, *e; + session_fifo_event_t *my_fifo_events; + u32 n_to_dequeue, n_events; + unix_shared_memory_queue_t *q; + application_t *app; + int n_tx_packets = 0; + u32 my_thread_index = vm->thread_index; + int i, rv; + f64 now = vlib_time_now (vm); + void (*fp) (void *); + + SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index); + + /* + * Update TCP time + */ + tcp_update_time (now, my_thread_index); + + /* + * Get vpp queue events + */ + q = smm->vpp_event_queues[my_thread_index]; + if (PREDICT_FALSE (q == 0)) + return 0; + + my_fifo_events = smm->free_event_vector[my_thread_index]; + + /* min number of events we can dequeue without blocking */ + n_to_dequeue = q->cursize; + my_pending_event_vector = smm->pending_event_vector[my_thread_index]; + + if (n_to_dequeue == 0 && vec_len (my_pending_event_vector) == 0) + return 0; + + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 0); + + /* + * If we didn't manage to process previous events try going + * over them again without dequeuing new ones. 
+ */ + /* XXX: Block senders to sessions that can't keep up */ + if (0 && vec_len (my_pending_event_vector) >= 100) + { + clib_warning ("too many fifo events unsolved"); + goto skip_dequeue; + } + + /* See you in the next life, don't be late */ + if (pthread_mutex_trylock (&q->mutex)) + return 0; + + for (i = 0; i < n_to_dequeue; i++) + { + vec_add2 (my_fifo_events, e, 1); + unix_shared_memory_queue_sub_raw (q, (u8 *) e); + } + + /* The other side of the connection is not polling */ + if (q->cursize < (q->maxsize / 8)) + (void) pthread_cond_broadcast (&q->condvar); + pthread_mutex_unlock (&q->mutex); + + vec_append (my_fifo_events, my_pending_event_vector); + + _vec_len (my_pending_event_vector) = 0; + smm->pending_event_vector[my_thread_index] = my_pending_event_vector; + +skip_dequeue: + n_events = vec_len (my_fifo_events); + for (i = 0; i < n_events; i++) + { + stream_session_t *s0; /* $$$ prefetch 1 ahead maybe */ + session_fifo_event_t *e0; + + e0 = &my_fifo_events[i]; + + switch (e0->event_type) + { + case FIFO_EVENT_APP_TX: + s0 = session_event_get_session (e0, my_thread_index); + + if (CLIB_DEBUG && !s0) + { + clib_warning ("It's dead, Jim!"); + continue; + } + + if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + continue; + /* Spray packets in per session type frames, since they go to + * different nodes */ + rv = (smm->session_tx_fns[s0->session_type]) (vm, node, smm, e0, s0, + my_thread_index, + &n_tx_packets); + /* Out of buffers */ + if (PREDICT_FALSE (rv < 0)) + { + vlib_node_increment_counter (vm, node->node_index, + SESSION_QUEUE_ERROR_NO_BUFFER, 1); + continue; + } + break; + case FIFO_EVENT_DISCONNECT: + s0 = stream_session_get_from_handle (e0->session_handle); + stream_session_disconnect (s0); + break; + case FIFO_EVENT_BUILTIN_RX: + s0 = session_event_get_session (e0, my_thread_index); + svm_fifo_unset_event (s0->server_rx_fifo); + app = application_get (s0->app_index); + app->cb_fns.builtin_server_rx_callback (s0); + break; + case 
FIFO_EVENT_RPC: + fp = e0->rpc_args.fp; + (*fp) (e0->rpc_args.arg); + break; + + default: + clib_warning ("unhandled event type %d", e0->event_type); + } + } + + _vec_len (my_fifo_events) = 0; + smm->free_event_vector[my_thread_index] = my_fifo_events; + + vlib_node_increment_counter (vm, session_queue_node.index, + SESSION_QUEUE_ERROR_TX, n_tx_packets); + + SESSION_EVT_DBG (SESSION_EVT_DEQ_NODE, 1); + + return n_tx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (session_queue_node) = +{ + .function = session_queue_node_fn, + .name = "session-queue", + .format_trace = format_session_queue_trace, + .type = VLIB_NODE_TYPE_INPUT, + .n_errors = ARRAY_LEN (session_queue_error_strings), + .error_strings = session_queue_error_strings, + .n_next_nodes = SESSION_QUEUE_N_NEXT, + .state = VLIB_NODE_STATE_DISABLED, + .next_nodes = + { + [SESSION_QUEUE_NEXT_DROP] = "error-drop", + [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup", + [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup", + [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output", + [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output", + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/stream_session.h b/src/vnet/session/stream_session.h new file mode 100644 index 00000000..82bbf521 --- /dev/null +++ b/src/vnet/session/stream_session.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_SESSION_STREAM_SESSION_H_ +#define SRC_VNET_SESSION_STREAM_SESSION_H_ + +#include +#include + +#define foreach_session_type \ + _(IP4_TCP, ip4_tcp) \ + _(IP4_UDP, ip4_udp) \ + _(IP6_TCP, ip6_tcp) \ + _(IP6_UDP, ip6_udp) + +typedef enum +{ +#define _(A, a) SESSION_TYPE_##A, + foreach_session_type +#undef _ + SESSION_N_TYPES, +} session_type_t; + +/* + * Application session state + */ +typedef enum +{ + SESSION_STATE_LISTENING, + SESSION_STATE_CONNECTING, + SESSION_STATE_ACCEPTING, + SESSION_STATE_READY, + SESSION_STATE_CLOSED, + SESSION_STATE_N_STATES, +} stream_session_state_t; + +typedef struct _stream_session_t +{ + /** fifo pointers. Once allocated, these do not move */ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + /** Type */ + u8 session_type; + + /** State */ + u8 session_state; + + u8 thread_index; + + /** To avoid n**2 "one event per frame" check */ + u8 enqueue_epoch; + + /** Pad to a multiple of 8 octets */ + u8 align_pad[4]; + + /** svm segment index where fifos were allocated */ + u32 svm_segment_index; + + /** Session index in per_thread pool */ + u32 session_index; + + /** Transport specific */ + u32 connection_index; + + /** stream server pool index */ + u32 app_index; + + /** Parent listener session if the result of an accept */ + u32 listener_index; + + u32 opaque2; + + /** Opaque, pad to a 64-octet boundary */ + u64 opaque[2]; +} stream_session_t; + +#endif /* SRC_VNET_SESSION_STREAM_SESSION_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c deleted file mode 100644 index abd94ba4..00000000 --- a/src/vnet/session/transport.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. 
- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -u32 -transport_endpoint_lookup (transport_endpoint_table_t *ht, ip46_address_t *ip, - u16 port) -{ - clib_bihash_kv_24_8_t kv; - int rv; - - kv.key[0] = ip->as_u64[0]; - kv.key[1] = ip->as_u64[1]; - kv.key[2] = port; - - rv = clib_bihash_search_inline_24_8 (ht, &kv); - if (rv == 0) - return kv.value; - - return TRANSPORT_ENDPOINT_INVALID_INDEX; -} - -void -transport_endpoint_table_add (transport_endpoint_table_t *ht, - transport_endpoint_t *te, u32 value) -{ - clib_bihash_kv_24_8_t kv; - - kv.key[0] = te->ip.as_u64[0]; - kv.key[1] = te->ip.as_u64[1]; - kv.key[2] = te->port; - kv.value = value; - - clib_bihash_add_del_24_8 (ht, &kv, 1); -} - -void -transport_endpoint_table_del (transport_endpoint_table_t *ht, - transport_endpoint_t *te) -{ - clib_bihash_kv_24_8_t kv; - - kv.key[0] = te->ip.as_u64[0]; - kv.key[1] = te->ip.as_u64[1]; - kv.key[2] = te->port; - - clib_bihash_add_del_24_8 (ht, &kv, 0); -} - - - diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 9c38bab9..3895a60a 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -21,6 +21,7 @@ #include #include #include + /* * Protocol independent transport properties associated to a session */ @@ -31,6 +32,7 @@ typedef struct _transport_connection u16 lcl_port; /**< Local port */ u16 rmt_port; /**< Remote port */ u8 proto; /**< Protocol id (also session type) */ + u32 vrf; /**< FIB table id 
*/ u32 s_index; /**< Parent session index */ u32 c_index; /**< Connection index in transport pool */ @@ -55,6 +57,7 @@ typedef struct _transport_connection #define c_lcl_port connection.lcl_port #define c_rmt_port connection.rmt_port #define c_proto connection.proto +#define c_vrf connection.vrf #define c_state connection.state #define c_s_index connection.s_index #define c_c_index connection.c_index @@ -66,165 +69,6 @@ typedef struct _transport_connection #define c_rmt_dpo connection.rmt_dpo } transport_connection_t; -/* - * Transport protocol virtual function table - */ -typedef struct _transport_proto_vft -{ - /* - * Setup - */ - u32 (*bind) (u32, ip46_address_t *, u16); - u32 (*unbind) (u32); - int (*open) (ip46_address_t * addr, u16 port_host_byte_order); - void (*close) (u32 conn_index, u32 thread_index); - void (*cleanup) (u32 conn_index, u32 thread_index); - - /* - * Transmission - */ - u32 (*push_header) (transport_connection_t * tconn, vlib_buffer_t * b); - u16 (*send_mss) (transport_connection_t * tc); - u32 (*send_space) (transport_connection_t * tc); - u32 (*tx_fifo_offset) (transport_connection_t * tc); - - /* - * Connection retrieval - */ - transport_connection_t *(*get_connection) (u32 conn_idx, u32 thread_idx); - transport_connection_t *(*get_listener) (u32 conn_index); - transport_connection_t *(*get_half_open) (u32 conn_index); - - /* - * Format - */ - u8 *(*format_connection) (u8 * s, va_list * args); - u8 *(*format_listener) (u8 * s, va_list * args); - u8 *(*format_half_open) (u8 * s, va_list * args); -} transport_proto_vft_t; - -/* *INDENT-OFF* */ -/* 16 octets */ -typedef CLIB_PACKED (struct { - union - { - struct - { - ip4_address_t src; - ip4_address_t dst; - u16 src_port; - u16 dst_port; - /* align by making this 4 octets even though its a 1-bit field - * NOTE: avoid key overlap with other transports that use 5 tuples for - * session identification. 
- */ - u32 proto; - }; - u64 as_u64[2]; - }; -}) v4_connection_key_t; - -typedef CLIB_PACKED (struct { - union - { - struct - { - /* 48 octets */ - ip6_address_t src; - ip6_address_t dst; - u16 src_port; - u16 dst_port; - u32 proto; - u64 unused; - }; - u64 as_u64[6]; - }; -}) v6_connection_key_t; -/* *INDENT-ON* */ - -typedef clib_bihash_kv_16_8_t session_kv4_t; -typedef clib_bihash_kv_48_8_t session_kv6_t; - -always_inline void -make_v4_ss_kv (session_kv4_t * kv, ip4_address_t * lcl, ip4_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - v4_connection_key_t *key = (v4_connection_key_t *) kv->key; - - key->src.as_u32 = lcl->as_u32; - key->dst.as_u32 = rmt->as_u32; - key->src_port = lcl_port; - key->dst_port = rmt_port; - key->proto = proto; - - kv->value = ~0ULL; -} - -always_inline void -make_v4_listener_kv (session_kv4_t * kv, ip4_address_t * lcl, u16 lcl_port, - u8 proto) -{ - v4_connection_key_t *key = (v4_connection_key_t *) kv->key; - - key->src.as_u32 = lcl->as_u32; - key->dst.as_u32 = 0; - key->src_port = lcl_port; - key->dst_port = 0; - key->proto = proto; - - kv->value = ~0ULL; -} - -always_inline void -make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) -{ - return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, - t->rmt_port, t->proto); -} - -always_inline void -make_v6_ss_kv (session_kv6_t * kv, ip6_address_t * lcl, ip6_address_t * rmt, - u16 lcl_port, u16 rmt_port, u8 proto) -{ - v6_connection_key_t *key = (v6_connection_key_t *) kv->key; - - key->src.as_u64[0] = lcl->as_u64[0]; - key->src.as_u64[1] = lcl->as_u64[1]; - key->dst.as_u64[0] = rmt->as_u64[0]; - key->dst.as_u64[1] = rmt->as_u64[1]; - key->src_port = lcl_port; - key->dst_port = rmt_port; - key->proto = proto; - key->unused = 0; - - kv->value = ~0ULL; -} - -always_inline void -make_v6_listener_kv (session_kv6_t * kv, ip6_address_t * lcl, u16 lcl_port, - u8 proto) -{ - v6_connection_key_t *key = (v6_connection_key_t *) kv->key; - - 
key->src.as_u64[0] = lcl->as_u64[0]; - key->src.as_u64[1] = lcl->as_u64[1]; - key->dst.as_u64[0] = 0; - key->dst.as_u64[1] = 0; - key->src_port = lcl_port; - key->dst_port = 0; - key->proto = proto; - key->unused = 0; - - kv->value = ~0ULL; -} - -always_inline void -make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) -{ - make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, - t->rmt_port, t->proto); -} - typedef enum _transport_proto { TRANSPORT_PROTO_TCP, @@ -239,18 +83,6 @@ typedef struct _transport_endpoint u32 vrf; /** fib table the endpoint is associated with */ } transport_endpoint_t; -typedef clib_bihash_24_8_t transport_endpoint_table_t; - -#define TRANSPORT_ENDPOINT_INVALID_INDEX ((u32)~0) - -u32 -transport_endpoint_lookup (transport_endpoint_table_t * ht, - ip46_address_t * ip, u16 port); -void transport_endpoint_table_add (transport_endpoint_table_t * ht, - transport_endpoint_t * te, u32 value); -void transport_endpoint_table_del (transport_endpoint_table_t * ht, - transport_endpoint_t * te); - #endif /* VNET_VNET_URI_TRANSPORT_H_ */ /* diff --git a/src/vnet/session/transport_interface.c b/src/vnet/session/transport_interface.c new file mode 100644 index 00000000..eb12aa69 --- /dev/null +++ b/src/vnet/session/transport_interface.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +/** + * Per-type vector of transport protocol virtual function tables + */ +transport_proto_vft_t *tp_vfts; + +u32 +transport_endpoint_lookup (transport_endpoint_table_t * ht, + ip46_address_t * ip, u16 port) +{ + clib_bihash_kv_24_8_t kv; + int rv; + + kv.key[0] = ip->as_u64[0]; + kv.key[1] = ip->as_u64[1]; + kv.key[2] = port; + + rv = clib_bihash_search_inline_24_8 (ht, &kv); + if (rv == 0) + return kv.value; + + return TRANSPORT_ENDPOINT_INVALID_INDEX; +} + +void +transport_endpoint_table_add (transport_endpoint_table_t * ht, + transport_endpoint_t * te, u32 value) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + kv.value = value; + + clib_bihash_add_del_24_8 (ht, &kv, 1); +} + +void +transport_endpoint_table_del (transport_endpoint_table_t * ht, + transport_endpoint_t * te) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + + clib_bihash_add_del_24_8 (ht, &kv, 0); +} + +/** + * Register transport virtual function table. 
+ * + * @param type - session type (not protocol type) + * @param vft - virtual function table + */ +void +session_register_transport (u8 session_type, + const transport_proto_vft_t * vft) +{ + vec_validate (tp_vfts, session_type); + tp_vfts[session_type] = *vft; + + /* If an offset function is provided, then peek instead of dequeue */ + session_manager_set_transport_rx_fn (session_type, + vft->tx_fifo_offset != 0); +} + +/** + * Get transport virtual function table + * + * @param type - session type (not protocol type) + */ +transport_proto_vft_t * +session_get_transport_vft (u8 session_type) +{ + if (session_type >= vec_len (tp_vfts)) + return 0; + return &tp_vfts[session_type]; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/transport_interface.h b/src/vnet/session/transport_interface.h new file mode 100644 index 00000000..b7e86ee7 --- /dev/null +++ b/src/vnet/session/transport_interface.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_VNET_SESSION_TRANSPORT_INTERFACE_H_ +#define SRC_VNET_SESSION_TRANSPORT_INTERFACE_H_ + +#include +#include + +/* + * Transport protocol virtual function table + */ +typedef struct _transport_proto_vft +{ + /* + * Setup + */ + u32 (*bind) (u32 session_index, transport_endpoint_t * lcl); + u32 (*unbind) (u32); + int (*open) (transport_endpoint_t * rmt); + void (*close) (u32 conn_index, u32 thread_index); + void (*cleanup) (u32 conn_index, u32 thread_index); + + /* + * Transmission + */ + u32 (*push_header) (transport_connection_t * tconn, vlib_buffer_t * b); + u16 (*send_mss) (transport_connection_t * tc); + u32 (*send_space) (transport_connection_t * tc); + u32 (*tx_fifo_offset) (transport_connection_t * tc); + + /* + * Connection retrieval + */ + transport_connection_t *(*get_connection) (u32 conn_idx, u32 thread_idx); + transport_connection_t *(*get_listener) (u32 conn_index); + transport_connection_t *(*get_half_open) (u32 conn_index); + + /* + * Format + */ + u8 *(*format_connection) (u8 * s, va_list * args); + u8 *(*format_listener) (u8 * s, va_list * args); + u8 *(*format_half_open) (u8 * s, va_list * args); +} transport_proto_vft_t; + +typedef clib_bihash_24_8_t transport_endpoint_table_t; + +#define TRANSPORT_ENDPOINT_INVALID_INDEX ((u32)~0) + +u32 transport_endpoint_lookup (transport_endpoint_table_t * ht, + ip46_address_t * ip, u16 port); +void transport_endpoint_table_add (transport_endpoint_table_t * ht, + transport_endpoint_t * te, u32 value); +void transport_endpoint_table_del (transport_endpoint_table_t * ht, + transport_endpoint_t * te); + +void session_register_transport (u8 session_type, + const transport_proto_vft_t * vft); +transport_proto_vft_t *session_get_transport_vft (u8 session_type); + +#endif /* SRC_VNET_SESSION_TRANSPORT_INTERFACE_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 
a2214158..6d1cfa07 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -22,8 +22,7 @@ tcp_main_t tcp_main; static u32 -tcp_connection_bind (u32 session_index, ip46_address_t * ip, - u16 port_host_byte_order, u8 is_ip4) +tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) { tcp_main_t *tm = &tcp_main; tcp_connection_t *listener; @@ -32,17 +31,18 @@ tcp_connection_bind (u32 session_index, ip46_address_t * ip, memset (listener, 0, sizeof (*listener)); listener->c_c_index = listener - tm->listener_pool; - listener->c_lcl_port = clib_host_to_net_u16 (port_host_byte_order); + listener->c_lcl_port = clib_host_to_net_u16 (lcl->port); - if (is_ip4) + if (lcl->is_ip4) { - listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32; listener->c_is_ip4 = 1; listener->c_proto = SESSION_TYPE_IP4_TCP; } else { - clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6, + sizeof (ip6_address_t)); listener->c_proto = SESSION_TYPE_IP6_TCP; } @@ -57,17 +57,9 @@ tcp_connection_bind (u32 session_index, ip46_address_t * ip, } u32 -tcp_session_bind_ip4 (u32 session_index, ip46_address_t * ip, - u16 port_host_byte_order) -{ - return tcp_connection_bind (session_index, ip, port_host_byte_order, 1); -} - -u32 -tcp_session_bind_ip6 (u32 session_index, ip46_address_t * ip, - u16 port_host_byte_order) +tcp_session_bind (u32 session_index, transport_endpoint_t * tep) { - return tcp_connection_bind (session_index, ip, port_host_byte_order, 0); + return tcp_connection_bind (session_index, tep); } static void @@ -133,10 +125,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Check if half-open */ if (tc->state == TCP_STATE_SYN_SENT) { - /* Poison the entry */ - if (CLIB_DEBUG > 0) - memset (tc, 0xFA, sizeof (*tc)); - pool_put (tm->half_open_connections, tc); + tcp_half_open_connection_del (tc); } else { @@ -172,9 +161,21 @@ tcp_half_open_connection_del (tcp_connection_t * 
tc) tcp_main_t *tm = vnet_get_tcp_main (); if (CLIB_DEBUG) memset (tc, 0xFA, sizeof (*tc)); - clib_spinlock_lock (&tm->half_open_lock); + clib_spinlock_lock_if_init (&tm->half_open_lock); pool_put (tm->half_open_connections, tc); - clib_spinlock_unlock (&tm->half_open_lock); + clib_spinlock_unlock_if_init (&tm->half_open_lock); +} + +tcp_connection_t * +tcp_half_open_connection_new () +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc = 0; + clib_spinlock_lock_if_init (&tm->half_open_lock); + pool_get (tm->half_open_connections, tc); + clib_spinlock_unlock_if_init (&tm->half_open_lock); + memset (tc, 0, sizeof (*tc)); + return tc; } tcp_connection_t * @@ -456,11 +457,13 @@ fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc) { fib_prefix_t prefix; + u32 fib_index; clib_memcpy (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr)); prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; prefix.fp_len = tc->c_is_ip4 ? 32 : 128; - return fib_table_lookup (0, &prefix); + fib_index = fib_table_find (prefix.fp_proto, tc->c_vrf); + return fib_table_lookup (fib_index, &prefix); } static int @@ -512,13 +515,13 @@ tcp_connection_init_vars (tcp_connection_t * tc) } int -tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) +tcp_connection_open (transport_endpoint_t * rmt) { tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc; fib_prefix_t prefix; fib_node_index_t fei; - u32 sw_if_index; + u32 sw_if_index, fib_index; ip46_address_t lcl_addr; int lcl_port; @@ -528,11 +531,12 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) memset (&lcl_addr, 0, sizeof (lcl_addr)); /* Find a FIB path to the destination */ - clib_memcpy (&prefix.fp_addr, rmt_addr, sizeof (*rmt_addr)); - prefix.fp_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; - prefix.fp_len = is_ip4 ? 32 : 128; + clib_memcpy (&prefix.fp_addr, &rmt->ip, sizeof (rmt->ip)); + prefix.fp_proto = rmt->is_ip4 ? 
FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; + prefix.fp_len = rmt->is_ip4 ? 32 : 128; - fei = fib_table_lookup (0, &prefix); + fib_index = fib_table_find (prefix.fp_proto, rmt->vrf); + fei = fib_table_lookup (fib_index, &prefix); /* Couldn't find route to destination. Bail out. */ if (fei == FIB_NODE_INDEX_INVALID) @@ -546,11 +550,11 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) if (sw_if_index == (u32) ~ 0) { clib_warning ("no resolving interface for %U", format_ip46_address, - rmt_addr, IP46_TYPE_IP4); + &rmt->ip, IP46_TYPE_IP4); return -1; } - if (is_ip4) + if (rmt->is_ip4) { ip4_address_t *ip4; int index; @@ -599,17 +603,16 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) * Create connection and send SYN */ - pool_get (tm->half_open_connections, tc); - memset (tc, 0, sizeof (*tc)); + tc = tcp_half_open_connection_new (); - clib_memcpy (&tc->c_rmt_ip, rmt_addr, sizeof (ip46_address_t)); + clib_memcpy (&tc->c_rmt_ip, &rmt->ip, sizeof (ip46_address_t)); clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t)); - tc->c_rmt_port = clib_host_to_net_u16 (rmt_port); + tc->c_rmt_port = clib_host_to_net_u16 (rmt->port); tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); tc->c_c_index = tc - tm->half_open_connections; - tc->c_is_ip4 = is_ip4; - tc->c_proto = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; - + tc->c_is_ip4 = rmt->is_ip4; + tc->c_proto = rmt->is_ip4 ? 
SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + tc->c_vrf = rmt->vrf; /* The other connection vars will be initialized after SYN ACK */ tcp_connection_timers_init (tc); @@ -621,15 +624,9 @@ tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) } int -tcp_session_open_ip4 (ip46_address_t * addr, u16 port) -{ - return tcp_connection_open (addr, port, 1); -} - -int -tcp_session_open_ip6 (ip46_address_t * addr, u16 port) +tcp_session_open (transport_endpoint_t * tep) { - return tcp_connection_open (addr, port, 0); + return tcp_connection_open (tep); } const char *tcp_dbg_evt_str[] = { @@ -1025,32 +1022,14 @@ tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) } /* *INDENT-OFF* */ -const static transport_proto_vft_t tcp4_proto = { - .bind = tcp_session_bind_ip4, - .unbind = tcp_session_unbind, - .push_header = tcp_push_header, - .get_connection = tcp_session_get_transport, - .get_listener = tcp_session_get_listener, - .get_half_open = tcp_half_open_session_get_transport, - .open = tcp_session_open_ip4, - .close = tcp_session_close, - .cleanup = tcp_session_cleanup, - .send_mss = tcp_session_send_mss, - .send_space = tcp_session_send_space, - .tx_fifo_offset = tcp_session_tx_fifo_offset, - .format_connection = format_tcp_session, - .format_listener = format_tcp_listener_session, - .format_half_open = format_tcp_half_open_session, -}; - -const static transport_proto_vft_t tcp6_proto = { - .bind = tcp_session_bind_ip6, +const static transport_proto_vft_t tcp_proto = { + .bind = tcp_session_bind, .unbind = tcp_session_unbind, .push_header = tcp_push_header, .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, .get_half_open = tcp_half_open_session_get_transport, - .open = tcp_session_open_ip6, + .open = tcp_session_open, .close = tcp_session_close, .cleanup = tcp_session_cleanup, .send_mss = tcp_session_send_mss, @@ -1200,9 +1179,9 @@ tcp_main_enable (vlib_main_t * vm) ip4_register_protocol (IP_PROTOCOL_TCP, 
tcp4_input_node.index); - /* Register as transport with URI */ - session_register_transport (SESSION_TYPE_IP4_TCP, &tcp4_proto); - session_register_transport (SESSION_TYPE_IP6_TCP, &tcp6_proto); + /* Register as transport with session layer */ + session_register_transport (SESSION_TYPE_IP4_TCP, &tcp_proto); + session_register_transport (SESSION_TYPE_IP6_TCP, &tcp_proto); /* * Initialize data structures @@ -1247,7 +1226,8 @@ tcp_main_enable (vlib_main_t * vm) clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", 200000 /* $$$$ config parameter nbuckets */ , (64 << 20) /*$$$ config parameter table size */ ); - clib_spinlock_init (&tm->half_open_lock); + if (num_threads > 1) + clib_spinlock_init (&tm->half_open_lock); return error; } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index cc5cecdc..d32b4fc8 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2742,7 +2742,10 @@ tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) if (!is_valid) { - if ((tmp = stream_session_lookup_half_open (&tc->connection))) + if ((tmp = + stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, + tc->c_lcl_port, tc->c_rmt_port, + tc->c_proto))) { if (tmp->lcl_port == hdr->dst_port && tmp->rmt_port == hdr->src_port) diff --git a/src/vnet/udp/udp.c b/src/vnet/udp/udp.c index 57e4a60e..ff76a82e 100644 --- a/src/vnet/udp/udp.c +++ b/src/vnet/udp/udp.c @@ -25,34 +25,32 @@ udp_uri_main_t udp_uri_main; u32 -udp_session_bind_ip4 (u32 session_index, - ip46_address_t * ip, u16 port_number_host_byte_order) +udp_session_bind_ip4 (u32 session_index, transport_endpoint_t * lcl) { udp_uri_main_t *um = vnet_get_udp_main (); udp_connection_t *listener; pool_get (um->udp_listeners, listener); memset (listener, 0, sizeof (udp_connection_t)); - listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); - listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + listener->c_lcl_port = clib_host_to_net_u16 (lcl->port); + 
listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32; listener->c_proto = SESSION_TYPE_IP4_UDP; - udp_register_dst_port (um->vlib_main, port_number_host_byte_order, - udp4_uri_input_node.index, 1 /* is_ipv4 */ ); + udp_register_dst_port (um->vlib_main, lcl->port, udp4_uri_input_node.index, + 1 /* is_ipv4 */ ); return 0; } u32 -udp_session_bind_ip6 (u32 session_index, - ip46_address_t * ip, u16 port_number_host_byte_order) +udp_session_bind_ip6 (u32 session_index, transport_endpoint_t * lcl) { udp_uri_main_t *um = vnet_get_udp_main (); udp_connection_t *listener; pool_get (um->udp_listeners, listener); - listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); - clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + listener->c_lcl_port = clib_host_to_net_u16 (lcl->port); + clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6, sizeof (ip6_address_t)); listener->c_proto = SESSION_TYPE_IP6_UDP; - udp_register_dst_port (um->vlib_main, port_number_host_byte_order, + udp_register_dst_port (um->vlib_main, lcl->port, udp4_uri_input_node.index, 0 /* is_ipv4 */ ); return 0; } @@ -251,7 +249,7 @@ udp_send_space_uri (transport_connection_t * t) } int -udp_open_connection (ip46_address_t * addr, u16 port) +udp_open_connection (transport_endpoint_t * tep) { clib_warning ("Not implemented"); return 0; -- cgit 1.2.3-korg From 68810624f84467503482b82662c980e8f0e36deb Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 24 Jul 2017 17:40:28 -0700 Subject: Make tcp active open data structures thread safe - Cleanup half-open connections and timers on the right thread - Ensure half-open connection and transport endpoint pools are thread safe - Enqueue TX events to the correct vpp thread in the builtin client - Use transport proto in transport connections instead of session type Change-Id: Id13239a206afbff6f34a38afa510fe014e4b2049 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo_segment.c | 6 ++ src/vnet/session/session.c | 14 +-- 
src/vnet/session/session.h | 3 +- src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_lookup.c | 103 ++++++++------------ src/vnet/session/session_lookup.h | 5 +- src/vnet/session/transport.h | 6 +- src/vnet/session/transport_interface.c | 5 +- src/vnet/session/transport_interface.h | 2 +- src/vnet/tcp/builtin_client.c | 125 +++++++++++++++---------- src/vnet/tcp/builtin_client.h | 9 +- src/vnet/tcp/tcp.c | 166 ++++++++++++++++++++------------- src/vnet/tcp/tcp.h | 18 ++-- src/vnet/tcp/tcp_debug.h | 32 +++---- src/vnet/tcp/tcp_input.c | 36 ++++--- src/vnet/tcp/tcp_output.c | 19 +++- src/vnet/tcp/tcp_test.c | 14 +-- src/vnet/udp/udp.c | 8 +- src/vnet/udp/udp_input.c | 2 +- 19 files changed, 321 insertions(+), 254 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index c80374a7..a01e26e4 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -376,6 +376,12 @@ svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f, break; } + if (CLIB_DEBUG) + { + f->master_session_index = ~0; + f->master_thread_index = ~0; + } + ssvm_pop_heap (oldheap); ssvm_unlock_non_recursive (sh); } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 48000a6f..004c7193 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -64,7 +64,8 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, s->server_tx_fifo = server_tx_fifo; /* Initialize state machine, such as it is... 
*/ - s->session_type = tc->proto; + s->session_type = session_type_from_proto_and_ip (tc->transport_proto, + tc->is_ip4); s->session_state = SESSION_STATE_CONNECTING; s->svm_segment_index = fifo_segment_index; s->thread_index = thread_index; @@ -354,8 +355,7 @@ stream_session_init_fifos_pointers (transport_connection_t * tc, } int -stream_session_connect_notify (transport_connection_t * tc, u8 sst, - u8 is_fail) +stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) { application_t *app; stream_session_t *new_s = 0; @@ -365,7 +365,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, handle = stream_session_half_open_lookup_handle (&tc->lcl_ip, &tc->rmt_ip, tc->lcl_port, tc->rmt_port, - tc->proto); + tc->transport_proto); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); @@ -391,7 +391,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, new_s->app_index = app->index; } - /* Notify client */ + /* Notify client application */ if (app->cb_fns.session_connected_callback (app->index, api_context, new_s, is_fail)) { @@ -406,7 +406,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 sst, } /* Cleanup session lookup */ - stream_session_half_open_table_del (sst, tc); + stream_session_half_open_table_del (tc); return error; } @@ -567,7 +567,7 @@ stream_session_open (u32 app_index, session_type_t st, handle = (((u64) app_index) << 32) | (u64) tc->c_index; /* Add to the half-open lookup table */ - stream_session_half_open_table_add (st, tc, handle); + stream_session_half_open_table_add (tc, handle); *res = tc; diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index bb22f100..180b9f8a 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -274,8 +274,7 @@ stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); 
-int stream_session_connect_notify (transport_connection_t * tc, u8 sst, - u8 is_fail); +int stream_session_connect_notify (transport_connection_t * tc, u8 is_fail); void stream_session_init_fifos_pointers (transport_connection_t * tc, u32 rx_pointer, u32 tx_pointer); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 4d432977..de564ea7 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -57,7 +57,7 @@ format_stream_session (u8 * s, va_list * args) u8 *str = 0; tp_vft = session_get_transport_vft (ss->session_type); - if (verbose == 1) + if (verbose == 1 && ss->session_state >= SESSION_STATE_ACCEPTING) str = format (0, "%-10u%-10u%-10lld", svm_fifo_max_dequeue (ss->server_rx_fifo), svm_fifo_max_enqueue (ss->server_tx_fifo), diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index b3862ee3..1ce22f80 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -107,7 +107,7 @@ always_inline void make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) { return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, - t->rmt_port, t->proto); + t->rmt_port, t->transport_proto); } always_inline void @@ -150,7 +150,7 @@ always_inline void make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) { make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, - t->rmt_port, t->proto); + t->rmt_port, t->transport_proto); } /* @@ -164,23 +164,17 @@ stream_session_table_add_for_tc (transport_connection_t * tc, u64 value) session_kv4_t kv4; session_kv6_t kv6; - switch (tc->proto) + if (tc->is_ip4) { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: make_v4_ss_kv_from_tc (&kv4, tc); kv4.value = value; clib_bihash_add_del_16_8 (&sl->v4_session_hash, &kv4, 1 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: + } + else + { make_v6_ss_kv_from_tc (&kv6, tc); kv6.value = value; 
clib_bihash_add_del_48_8 (&sl->v6_session_hash, &kv6, 1 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); } } @@ -195,59 +189,24 @@ stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, stream_session_table_add_for_tc (tc, value); } -void -stream_session_half_open_table_add (session_type_t sst, - transport_connection_t * tc, u64 value) -{ - session_lookup_t *sl = &session_lookup; - session_kv4_t kv4; - session_kv6_t kv6; - - switch (sst) - { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: - make_v4_ss_kv_from_tc (&kv4, tc); - kv4.value = value; - clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, - 1 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: - make_v6_ss_kv_from_tc (&kv6, tc); - kv6.value = value; - clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, - 1 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); - } -} - int stream_session_table_del_for_tc (transport_connection_t * tc) { session_lookup_t *sl = &session_lookup; session_kv4_t kv4; session_kv6_t kv6; - switch (tc->proto) + + if (tc->is_ip4) { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: make_v4_ss_kv_from_tc (&kv4, tc); return clib_bihash_add_del_16_8 (&sl->v4_session_hash, &kv4, 0 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: + } + else + { make_v6_ss_kv_from_tc (&kv6, tc); return clib_bihash_add_del_48_8 (&sl->v6_session_hash, &kv6, 0 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); } return 0; @@ -262,30 +221,48 @@ stream_session_table_del (stream_session_t * s) return stream_session_table_del_for_tc (ts); } + void -stream_session_half_open_table_del (u8 sst, transport_connection_t * tc) +stream_session_half_open_table_add (transport_connection_t * tc, u64 value) { session_lookup_t *sl = &session_lookup; session_kv4_t kv4; session_kv6_t 
kv6; - switch (sst) + if (tc->is_ip4) + { + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, + 1 /* is_add */ ); + } + else + { + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, + 1 /* is_add */ ); + } +} + +void +stream_session_half_open_table_del (transport_connection_t * tc) +{ + session_lookup_t *sl = &session_lookup; + session_kv4_t kv4; + session_kv6_t kv6; + + if (tc->is_ip4) { - case SESSION_TYPE_IP4_UDP: - case SESSION_TYPE_IP4_TCP: make_v4_ss_kv_from_tc (&kv4, tc); clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, 0 /* is_add */ ); - break; - case SESSION_TYPE_IP6_UDP: - case SESSION_TYPE_IP6_TCP: + } + else + { make_v6_ss_kv_from_tc (&kv6, tc); clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, 0 /* is_add */ ); - break; - default: - clib_warning ("Session type not supported"); - ASSERT (0); } } diff --git a/src/vnet/session/session_lookup.h b/src/vnet/session/session_lookup.h index 9e92dab1..cf1dc013 100644 --- a/src/vnet/session/session_lookup.h +++ b/src/vnet/session/session_lookup.h @@ -83,9 +83,8 @@ transport_connection_t *stream_session_half_open_lookup (ip46_address_t * lcl, void stream_session_table_add_for_tc (transport_connection_t * tc, u64 value); int stream_session_table_del_for_tc (transport_connection_t * tc); int stream_session_table_del (stream_session_t * s); -void stream_session_half_open_table_del (u8 sst, transport_connection_t * tc); -void stream_session_half_open_table_add (session_type_t sst, - transport_connection_t * tc, +void stream_session_half_open_table_del (transport_connection_t * tc); +void stream_session_half_open_table_add (transport_connection_t * tc, u64 value); void session_lookup_init (void); diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h index 3895a60a..e56be338 100644 --- a/src/vnet/session/transport.h +++ b/src/vnet/session/transport.h @@ -31,12 
+31,12 @@ typedef struct _transport_connection ip46_address_t lcl_ip; /**< Local IP */ u16 lcl_port; /**< Local port */ u16 rmt_port; /**< Remote port */ - u8 proto; /**< Protocol id (also session type) */ + u8 transport_proto; /**< Protocol id */ + u8 is_ip4; /**< Flag if IP4 connection */ u32 vrf; /**< FIB table id */ u32 s_index; /**< Parent session index */ u32 c_index; /**< Connection index in transport pool */ - u8 is_ip4; /**< Flag if IP4 connection */ u32 thread_index; /**< Worker-thread index */ fib_node_index_t rmt_fei; /**< FIB entry index for rmt */ @@ -56,7 +56,7 @@ typedef struct _transport_connection #define c_rmt_ip6 connection.rmt_ip.ip6 #define c_lcl_port connection.lcl_port #define c_rmt_port connection.rmt_port -#define c_proto connection.proto +#define c_transport_proto connection.transport_proto #define c_vrf connection.vrf #define c_state connection.state #define c_s_index connection.s_index diff --git a/src/vnet/session/transport_interface.c b/src/vnet/session/transport_interface.c index eb12aa69..ef8d1e49 100644 --- a/src/vnet/session/transport_interface.c +++ b/src/vnet/session/transport_interface.c @@ -73,9 +73,12 @@ transport_endpoint_table_del (transport_endpoint_table_t * ht, * @param vft - virtual function table */ void -session_register_transport (u8 session_type, +session_register_transport (transport_proto_t transport_proto, u8 is_ip4, const transport_proto_vft_t * vft) { + u8 session_type; + session_type = session_type_from_proto_and_ip (transport_proto, is_ip4); + vec_validate (tp_vfts, session_type); tp_vfts[session_type] = *vft; diff --git a/src/vnet/session/transport_interface.h b/src/vnet/session/transport_interface.h index b7e86ee7..661221c4 100644 --- a/src/vnet/session/transport_interface.h +++ b/src/vnet/session/transport_interface.h @@ -67,7 +67,7 @@ void transport_endpoint_table_add (transport_endpoint_table_t * ht, void transport_endpoint_table_del (transport_endpoint_table_t * ht, transport_endpoint_t * te); -void 
session_register_transport (u8 session_type, +void session_register_transport (transport_proto_t transport_proto, u8 is_ip4, const transport_proto_vft_t * vft); transport_proto_vft_t *session_get_transport_vft (u8 session_type); diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 744f50e7..27e20f8e 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -45,6 +45,24 @@ #define TCP_BUILTIN_CLIENT_DBG (0) +static void +signal_evt_to_cli_i (int *code) +{ + tclient_main_t *tm = &tclient_main; + ASSERT (vlib_get_thread_index () == 0); + vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, *code, 0); +} + +static void +signal_evt_to_cli (int code) +{ + if (vlib_get_thread_index () != 0) + vl_api_rpc_call_main_thread (signal_evt_to_cli_i, (u8 *) & code, + sizeof (code)); + else + signal_evt_to_cli_i (&code); +} + static void send_test_chunk (tclient_main_t * tm, session_t * s) { @@ -53,6 +71,7 @@ send_test_chunk (tclient_main_t * tm, session_t * s) u32 bytes_this_chunk; session_fifo_event_t evt; static int serial_number = 0; + svm_fifo_t *txf; int rv; ASSERT (vec_len (test_data) > 0); @@ -63,7 +82,8 @@ send_test_chunk (tclient_main_t * tm, session_t * s) bytes_this_chunk = bytes_this_chunk < s->bytes_to_send ? bytes_this_chunk : s->bytes_to_send; - rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, bytes_this_chunk, + txf = s->server_tx_fifo; + rv = svm_fifo_enqueue_nowait (txf, bytes_this_chunk, test_data + test_buf_offset); /* If we managed to enqueue data... 
*/ @@ -93,15 +113,16 @@ send_test_chunk (tclient_main_t * tm, session_t * s) } /* Poke the session layer */ - if (svm_fifo_set_event (s->server_tx_fifo)) + if (svm_fifo_set_event (txf)) { /* Fabricate TX event, send to vpp */ - evt.fifo = s->server_tx_fifo; + evt.fifo = txf; evt.event_type = FIFO_EVENT_APP_TX; evt.event_id = serial_number++; - if (unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ )) + if (unix_shared_memory_queue_add + (tm->vpp_event_queue[txf->master_thread_index], (u8 *) & evt, + 0 /* do wait for mutex */ )) clib_warning ("could not enqueue event"); } } @@ -112,14 +133,16 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) { svm_fifo_t *rx_fifo = s->server_rx_fifo; int n_read, test_bytes = 0; + u32 my_thread_index = vlib_get_thread_index (); /* Allow enqueuing of new event */ // svm_fifo_unset_event (rx_fifo); if (test_bytes) { - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), - tm->rx_buf); + n_read = svm_fifo_dequeue_nowait (rx_fifo, + vec_len (tm->rx_buf[my_thread_index]), + tm->rx_buf[my_thread_index]); } else { @@ -151,10 +174,12 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) int i; for (i = 0; i < n_read; i++) { - if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff)) + if (tm->rx_buf[my_thread_index][i] + != ((s->bytes_received + i) & 0xff)) { clib_warning ("read %d error at byte %lld, 0x%x not 0x%x", - n_read, s->bytes_received + i, tm->rx_buf[i], + n_read, s->bytes_received + i, + tm->rx_buf[my_thread_index][i], ((s->bytes_received + i) & 0xff)); } } @@ -247,7 +272,11 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (s) { - stream_session_disconnect (s); + vnet_disconnect_args_t _a, *a = &_a; + a->handle = stream_session_handle (s); + a->app_index = tm->app_index; + vnet_disconnect_session (a); + vec_delete (connections_this_batch, 1, i); i--; __sync_fetch_and_add (&tm->ready_connections, -1); @@ -258,9 +287,7 @@ 
builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, /* Kick the debug CLI process */ if (tm->ready_connections == 0) { - tm->test_end_time = vlib_time_now (vm); - vlib_process_signal_event (vm, tm->cli_node_index, - 2, 0 /* data */ ); + signal_evt_to_cli (2); } } } @@ -369,27 +396,31 @@ static int tcp_test_clients_init (vlib_main_t * vm) { tclient_main_t *tm = &tclient_main; - vlib_thread_main_t *thread_main = vlib_get_thread_main (); + vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; int i; tclient_api_hookup (vm); if (create_api_loopback (tm)) return -1; + num_threads = 1 /* main thread */ + vtm->n_threads; + /* Init test data. Big buffer */ vec_validate (tm->connect_test_data, 1024 * 1024 - 1); for (i = 0; i < vec_len (tm->connect_test_data); i++) tm->connect_test_data[i] = i & 0xff; - tm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); - vec_validate (tm->rx_buf, vec_len (tm->connect_test_data) - 1); + vec_validate (tm->rx_buf, num_threads - 1); + for (i = 0; i < num_threads; i++) + vec_validate (tm->rx_buf[i], vec_len (tm->connect_test_data) - 1); tm->is_init = 1; - tm->vlib_main = vm; - vec_validate (tm->connection_index_by_thread, thread_main->n_vlib_mains); - vec_validate (tm->connections_this_batch_by_thread, - thread_main->n_vlib_mains); + vec_validate (tm->connection_index_by_thread, vtm->n_vlib_mains); + vec_validate (tm->connections_this_batch_by_thread, vtm->n_vlib_mains); + vec_validate (tm->vpp_event_queue, vtm->n_vlib_mains); + return 0; } @@ -400,23 +431,28 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, tclient_main_t *tm = &tclient_main; session_t *session; u32 session_index; - int i; + u8 thread_index = vlib_get_thread_index (); + + ASSERT (s->thread_index == thread_index); if (is_fail) { clib_warning ("connection %d failed!", api_context); - vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, -1, - 0 /* data */ ); - return -1; + signal_evt_to_cli (-1); + 
return 0; } - tm->our_event_queue = session_manager_get_vpp_event_queue (s->thread_index); - tm->vpp_event_queue = session_manager_get_vpp_event_queue (s->thread_index); + if (!tm->vpp_event_queue[thread_index]) + tm->vpp_event_queue[thread_index] = + session_manager_get_vpp_event_queue (thread_index); /* * Setup session */ + clib_spinlock_lock_if_init (&tm->sessions_lock); pool_get (tm->sessions, session); + clib_spinlock_unlock_if_init (&tm->sessions_lock); + memset (session, 0, sizeof (*session)); session_index = session - tm->sessions; session->bytes_to_send = tm->bytes_to_send; @@ -427,32 +463,13 @@ builtin_session_connected_callback (u32 app_index, u32 api_context, session->server_tx_fifo->client_session_index = session_index; session->vpp_session_handle = stream_session_handle (s); - /* Add it to the session lookup table */ - hash_set (tm->session_index_by_vpp_handles, session->vpp_session_handle, - session_index); - - if (tm->ready_connections == tm->expected_connections - 1) - { - vlib_thread_main_t *thread_main = vlib_get_thread_main (); - int thread_index; - - thread_index = 0; - for (i = 0; i < pool_elts (tm->sessions); i++) - { - vec_add1 (tm->connection_index_by_thread[thread_index], i); - thread_index++; - if (thread_index == thread_main->n_vlib_mains) - thread_index = 0; - } - } + vec_add1 (tm->connection_index_by_thread[thread_index], session_index); __sync_fetch_and_add (&tm->ready_connections, 1); if (tm->ready_connections == tm->expected_connections) { tm->run_test = 1; - tm->test_start_time = vlib_time_now (tm->vlib_main); /* Signal the CLI process that the action is starting... 
*/ - vlib_process_signal_event (tm->vlib_main, tm->cli_node_index, 1, - 0 /* data */ ); + signal_evt_to_cli (1); } return 0; @@ -606,7 +623,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->connections_per_batch = 1000; tm->private_segment_count = 0; tm->private_segment_size = 0; - + tm->vlib_main = vm; + if (thread_main->n_vlib_mains > 1) + clib_spinlock_init (&tm->sessions_lock); vec_free (tm->connect_uri); while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) @@ -668,7 +687,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, start_tx_pthread (); #endif + vlib_worker_thread_barrier_sync (vm); vnet_session_enable_disable (vm, 1 /* turn on TCP, etc. */ ); + vlib_worker_thread_barrier_release (vm); if (tm->test_client_attached == 0) { @@ -688,9 +709,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... */ - vlib_process_wait_for_event_or_clock (vm, 10.0 /* timeout, seconds */ ); + vlib_process_wait_for_event_or_clock (vm, 10 /* timeout, seconds */ ); event_type = vlib_process_get_events (vm, &event_data); - switch (event_type) { case ~0: @@ -699,6 +719,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, goto cleanup; case 1: + tm->test_start_time = vlib_time_now (tm->vlib_main); vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time); break; @@ -710,7 +731,6 @@ test_tcp_clients_command_fn (vlib_main_t * vm, /* Now wait for the sessions to finish... 
*/ vlib_process_wait_for_event_or_clock (vm, cli_timeout); event_type = vlib_process_get_events (vm, &event_data); - switch (event_type) { case ~0: @@ -719,6 +739,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, goto cleanup; case 2: + tm->test_end_time = vlib_time_now (vm); vlib_cli_output (vm, "Test finished at %.6f", tm->test_end_time); break; @@ -753,6 +774,7 @@ cleanup: vec_reset_length (tm->connection_index_by_thread[i]); vec_reset_length (tm->connections_this_batch_by_thread[i]); } + pool_free (tm->sessions); return 0; @@ -765,6 +787,7 @@ VLIB_CLI_COMMAND (test_clients_command, static) = .short_help = "test tcp clients [nclients %d]" "[iterations %d] [bytes %d] [uri tcp://6.0.1.1/1234]", .function = test_tcp_clients_command_fn, + .is_mp_safe = 1, }; /* *INDENT-ON* */ diff --git a/src/vnet/tcp/builtin_client.h b/src/vnet/tcp/builtin_client.h index 38af231d..06d239ef 100644 --- a/src/vnet/tcp/builtin_client.h +++ b/src/vnet/tcp/builtin_client.h @@ -48,8 +48,7 @@ typedef struct * Application setup parameters */ unix_shared_memory_queue_t *vl_input_queue; /**< vpe input queue */ - unix_shared_memory_queue_t *our_event_queue; /**< Our event queue */ - unix_shared_memory_queue_t *vpp_event_queue; /**< $$$ single thread */ + unix_shared_memory_queue_t **vpp_event_queue; u32 cli_node_index; /**< cli process node index */ u32 my_client_index; /**< loopback API client handle */ @@ -70,9 +69,9 @@ typedef struct /* * Test state variables */ - session_t *sessions; /**< Sessions pool */ - u8 *rx_buf; /**< intermediate rx buffer */ - uword *session_index_by_vpp_handles; /**< Hash table for disconnecting */ + session_t *sessions; /**< Session pool, shared */ + clib_spinlock_t sessions_lock; + u8 **rx_buf; /**< intermediate rx buffers */ u8 *connect_test_data; /**< Pre-computed test data */ u32 **connection_index_by_thread; u32 **connections_this_batch_by_thread; /**< active connection batch */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 6d1cfa07..59b20747 
100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -37,15 +37,14 @@ tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl) { listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32; listener->c_is_ip4 = 1; - listener->c_proto = SESSION_TYPE_IP4_TCP; } else { clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6, sizeof (ip6_address_t)); - listener->c_proto = SESSION_TYPE_IP6_TCP; - } + } + listener->c_transport_proto = TRANSPORT_PROTO_TCP; listener->c_s_index = session_index; listener->state = TCP_STATE_LISTEN; @@ -95,6 +94,71 @@ tcp_session_get_listener (u32 listener_index) return &tc->connection; } +always_inline void +transport_endpoint_del (u32 tepi) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + clib_spinlock_lock_if_init (&tm->local_endpoints_lock); + pool_put_index (tm->local_endpoints, tepi); + clib_spinlock_unlock_if_init (&tm->local_endpoints_lock); +} + +always_inline transport_endpoint_t * +transport_endpoint_new (void) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + transport_endpoint_t *tep; + pool_get (tm->local_endpoints, tep); + return tep; +} + +/** + * Cleanup half-open connection + * + */ +void +tcp_half_open_connection_del (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + clib_spinlock_lock_if_init (&tm->half_open_lock); + pool_put_index (tm->half_open_connections, tc->c_c_index); + if (CLIB_DEBUG) + memset (tc, 0xFA, sizeof (*tc)); + clib_spinlock_unlock_if_init (&tm->half_open_lock); +} + +/** + * Try to cleanup half-open connection + * + * If called from a thread that doesn't own tc, the call won't have any + * effect. + * + * @param tc - connection to be cleaned up + * @return non-zero if cleanup failed. 
+ */ +int +tcp_half_open_connection_cleanup (tcp_connection_t * tc) +{ + /* Make sure this is the owning thread */ + if (tc->c_thread_index != vlib_get_thread_index ()) + return 1; + tcp_timer_reset (tc, TCP_TIMER_ESTABLISH); + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT_SYN); + tcp_half_open_connection_del (tc); + return 0; +} + +tcp_connection_t * +tcp_half_open_connection_new (void) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc = 0; + pool_get (tm->half_open_connections, tc); + memset (tc, 0, sizeof (*tc)); + tc->c_c_index = tc - tm->half_open_connections; + return tc; +} + /** * Cleans up connection state. * @@ -110,26 +174,28 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Cleanup local endpoint if this was an active connect */ tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, tc->c_lcl_port); - - /*XXX lock */ if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) { tep = pool_elt_at_index (tm->local_endpoints, tepi); transport_endpoint_table_del (&tm->local_endpoints_table, tep); - pool_put (tm->local_endpoints, tep); + transport_endpoint_del (tepi); } - /* Make sure all timers are cleared */ - tcp_connection_timers_reset (tc); - - /* Check if half-open */ + /* Check if connection is not yet fully established */ if (tc->state == TCP_STATE_SYN_SENT) { - tcp_half_open_connection_del (tc); + /* Try to remove the half-open connection. If this is not the owning + * thread, tc won't be removed. Retransmit or establish timers will + * eventually expire and call again cleanup on the right thread. 
*/ + tcp_half_open_connection_cleanup (tc); } else { int thread_index = tc->c_thread_index; + + /* Make sure all timers are cleared */ + tcp_connection_timers_reset (tc); + /* Poison the entry */ if (CLIB_DEBUG > 0) memset (tc, 0xFA, sizeof (*tc)); @@ -152,32 +218,6 @@ tcp_connection_del (tcp_connection_t * tc) tcp_connection_cleanup (tc); } -/** - * Cleanup half-open connection - */ -void -tcp_half_open_connection_del (tcp_connection_t * tc) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - if (CLIB_DEBUG) - memset (tc, 0xFA, sizeof (*tc)); - clib_spinlock_lock_if_init (&tm->half_open_lock); - pool_put (tm->half_open_connections, tc); - clib_spinlock_unlock_if_init (&tm->half_open_lock); -} - -tcp_connection_t * -tcp_half_open_connection_new () -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tcp_connection_t *tc = 0; - clib_spinlock_lock_if_init (&tm->half_open_lock); - pool_get (tm->half_open_connections, tc); - clib_spinlock_unlock_if_init (&tm->half_open_lock); - memset (tc, 0, sizeof (*tc)); - return tc; -} - tcp_connection_t * tcp_connection_new (u8 thread_index) { @@ -207,9 +247,7 @@ tcp_connection_reset (tcp_connection_t * tc) tcp_connection_cleanup (tc); break; case TCP_STATE_SYN_SENT: - /* XXX remove sst from call */ - stream_session_connect_notify (&tc->connection, tc->connection.proto, - 1 /* fail */ ); + stream_session_connect_notify (&tc->connection, 1 /* fail */ ); tcp_connection_cleanup (tc); break; case TCP_STATE_ESTABLISHED: @@ -225,7 +263,7 @@ tcp_connection_reset (tcp_connection_t * tc) stream_session_reset_notify (&tc->connection); /* Wait for cleanup from session layer but not forever */ - tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); break; case TCP_STATE_CLOSED: return; @@ -325,8 +363,9 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) * table to mark the pair as used. 
*/ int -tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) +tcp_allocate_local_port (ip46_address_t * ip) { + tcp_main_t *tm = vnet_get_tcp_main (); transport_endpoint_t *tep; u32 time_now, tei; u16 min = 1024, max = 65535; /* XXX configurable ? */ @@ -338,10 +377,6 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) /* Only support active opens from thread 0 */ ASSERT (vlib_get_thread_index () == 0); - /* Start at random point or max */ - pool_get (tm->local_endpoints, tep); - clib_memcpy (&tep->ip, ip, sizeof (*ip)); - /* Search for first free slot */ for (; tries >= 0; tries--) { @@ -355,21 +390,22 @@ tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) break; } - tep->port = port; - /* Look it up */ - tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, - tep->port); + tei = transport_endpoint_lookup (&tm->local_endpoints_table, ip, port); /* If not found, we're done */ if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) { + clib_spinlock_lock_if_init (&tm->local_endpoints_lock); + tep = transport_endpoint_new (); + clib_memcpy (&tep->ip, ip, sizeof (*ip)); + tep->port = port; transport_endpoint_table_add (&tm->local_endpoints_table, tep, tep - tm->local_endpoints); + clib_spinlock_unlock_if_init (&tm->local_endpoints_lock); + return tep->port; } } - /* No free ports */ - pool_put (tm->local_endpoints, tep); return -1; } @@ -592,7 +628,7 @@ tcp_connection_open (transport_endpoint_t * rmt) } /* Allocate source port */ - lcl_port = tcp_allocate_local_port (tm, &lcl_addr); + lcl_port = tcp_allocate_local_port (&lcl_addr); if (lcl_port < 1) { clib_warning ("Failed to allocate src port"); @@ -602,16 +638,14 @@ tcp_connection_open (transport_endpoint_t * rmt) /* * Create connection and send SYN */ - + clib_spinlock_lock_if_init (&tm->half_open_lock); tc = tcp_half_open_connection_new (); - clib_memcpy (&tc->c_rmt_ip, &rmt->ip, sizeof (ip46_address_t)); clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t)); 
tc->c_rmt_port = clib_host_to_net_u16 (rmt->port); tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); - tc->c_c_index = tc - tm->half_open_connections; tc->c_is_ip4 = rmt->is_ip4; - tc->c_proto = rmt->is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + tc->c_transport_proto = TRANSPORT_PROTO_TCP; tc->c_vrf = rmt->vrf; /* The other connection vars will be initialized after SYN ACK */ tcp_connection_timers_init (tc); @@ -619,6 +653,7 @@ tcp_connection_open (transport_endpoint_t * rmt) TCP_EVT_DBG (TCP_EVT_OPEN, tc); tc->state = TCP_STATE_SYN_SENT; tcp_send_syn (tc); + clib_spinlock_unlock_if_init (&tm->half_open_lock); return tc->c_c_index; } @@ -1057,16 +1092,12 @@ void tcp_timer_establish_handler (u32 conn_index) { tcp_connection_t *tc; - u8 sst; tc = tcp_half_open_connection_get (conn_index); tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; ASSERT (tc->state == TCP_STATE_SYN_SENT); - - sst = tc->c_is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; - stream_session_connect_notify (&tc->connection, sst, 1 /* fail */ ); - + stream_session_connect_notify (&tc->connection, 1 /* fail */ ); tcp_connection_cleanup (tc); } @@ -1077,6 +1108,8 @@ tcp_timer_waitclose_handler (u32 conn_index) tcp_connection_t *tc; tc = tcp_connection_get (conn_index, thread_index); + if (!tc) + return; tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; /* Session didn't come back with a close(). 
Send FIN either way @@ -1180,8 +1213,8 @@ tcp_main_enable (vlib_main_t * vm) ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index); /* Register as transport with session layer */ - session_register_transport (SESSION_TYPE_IP4_TCP, &tcp_proto); - session_register_transport (SESSION_TYPE_IP6_TCP, &tcp_proto); + session_register_transport (TRANSPORT_PROTO_TCP, 1, &tcp_proto); + session_register_transport (TRANSPORT_PROTO_TCP, 0, &tcp_proto); /* * Initialize data structures @@ -1227,7 +1260,10 @@ tcp_main_enable (vlib_main_t * vm) 200000 /* $$$$ config parameter nbuckets */ , (64 << 20) /*$$$ config parameter table size */ ); if (num_threads > 1) - clib_spinlock_init (&tm->half_open_lock); + { + clib_spinlock_init (&tm->half_open_lock); + clib_spinlock_init (&tm->local_endpoints_lock); + } return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 89c30616..4fa681f8 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -115,7 +115,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(SENT_RCV_WND0, "Sent 0 receive window") \ _(RECOVERY, "Recovery on") \ _(FAST_RECOVERY, "Fast Recovery on") \ - _(FR_1_SMSS, "Sent 1 SMSS") + _(FR_1_SMSS, "Sent 1 SMSS") \ + _(HALF_OPEN_DONE, "Half-open completed") typedef enum _tcp_connection_flag_bits { @@ -381,6 +382,7 @@ typedef struct _tcp_main /* Local endpoints lookup table */ transport_endpoint_table_t local_endpoints_table; + clib_spinlock_t local_endpoints_lock; /* Congestion control algorithms registered */ tcp_cc_algorithm_t *cc_algos; @@ -430,7 +432,8 @@ clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); always_inline tcp_connection_t * tcp_connection_get (u32 conn_index, u32 thread_index) { - if (pool_is_free_index (tcp_main.connections[thread_index], conn_index)) + if (PREDICT_FALSE + (pool_is_free_index (tcp_main.connections[thread_index], conn_index))) return 0; return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); } @@ -454,7 +457,7 @@ 
tcp_get_connection_from_transport (transport_connection_t * tconn) void tcp_connection_close (tcp_connection_t * tc); void tcp_connection_cleanup (tcp_connection_t * tc); void tcp_connection_del (tcp_connection_t * tc); -void tcp_half_open_connection_del (tcp_connection_t * tc); +int tcp_half_open_connection_cleanup (tcp_connection_t * tc); tcp_connection_t *tcp_connection_new (u8 thread_index); void tcp_connection_reset (tcp_connection_t * tc); @@ -473,9 +476,12 @@ tcp_listener_get (u32 tli) always_inline tcp_connection_t * tcp_half_open_connection_get (u32 conn_index) { - if (pool_is_free_index (tcp_main.half_open_connections, conn_index)) - return 0; - return pool_elt_at_index (tcp_main.half_open_connections, conn_index); + tcp_connection_t *tc = 0; + clib_spinlock_lock_if_init (&tcp_main.half_open_lock); + if (!pool_is_free_index (tcp_main.half_open_connections, conn_index)) + tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index); + clib_spinlock_unlock_if_init (&tcp_main.half_open_lock); + return tc; } void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index e3da56f4..fc36eb29 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -19,9 +19,9 @@ #include #define TCP_DEBUG (1) -#define TCP_DEBUG_SM (2) -#define TCP_DEBUG_CC (0) -#define TCP_DEBUG_CC_STAT (0) +#define TCP_DEBUG_SM (0) +#define TCP_DEBUG_CC (1) +#define TCP_DEBUG_CC_STAT (1) #define foreach_tcp_dbg_evt \ _(INIT, "") \ @@ -197,6 +197,19 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->c_c_index; \ } +#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +{ \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "syn-rx: irs %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->irs; \ + TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ +} + #define TCP_EVT_UNBIND_HANDLER(_tc, ...) 
\ { \ TCP_EVT_DEALLOC_HANDLER(_tc); \ @@ -258,19 +271,6 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->state; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ -{ \ - TCP_EVT_INIT_HANDLER(_tc, 0); \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "syn-rx: irs %u", \ - .format_args = "i4", \ - }; \ - DECLARE_ETD(_tc, _e, 1); \ - ed->data[0] = _tc->irs; \ - TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ -} - #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index d32b4fc8..6c59d70f 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1724,9 +1724,13 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ tc0->state = TCP_STATE_CLOSE_WAIT; TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); - tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0); + if (vnet_buffer (b0)->tcp.data_len == 0) + { + tc0->rcv_nxt += 1; + next0 = TCP_ESTABLISHED_NEXT_DROP; + } stream_session_disconnect_notify (&tc0->connection); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } done: @@ -1819,7 +1823,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->thread_index, errors = 0; - u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1936,10 +1939,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_options_parse (tcp0, &tc0->rcv_opts)) goto drop; - /* Stop connection establishment and retransmit timers */ - tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); - tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); - /* Valid SYN or SYN-ACK. Move connection from half-open pool to * current thread pool. 
*/ pool_get (tm->connections[my_thread_index], new_tc0); @@ -1948,7 +1947,14 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->c_thread_index = my_thread_index; new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; new_tc0->irs = seq0; - tcp_half_open_connection_del (tc0); + new_tc0->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = + TCP_TIMER_HANDLE_INVALID; + + /* If this is not the owning thread, wait for syn retransmit to + * expire and cleanup then */ + if (tcp_half_open_connection_cleanup (tc0)) + tc0->flags |= TCP_CONN_HALF_OPEN_DONE; if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { @@ -1980,11 +1986,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Notify app that we have connection. If session layer can't * allocate session send reset */ - if (stream_session_connect_notify (&new_tc0->connection, sst, - 0)) + if (stream_session_connect_notify (&new_tc0->connection, 0)) { + tcp_send_reset (new_tc0, b0, is_ip4); tcp_connection_cleanup (new_tc0); - tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -2002,8 +2007,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->state = TCP_STATE_SYN_RCVD; /* Notify app that we have connection */ - if (stream_session_connect_notify - (&new_tc0->connection, sst, 0)) + if (stream_session_connect_notify (&new_tc0->connection, 0)) { tcp_connection_cleanup (new_tc0); tcp_send_reset (tc0, b0, is_ip4); @@ -2250,6 +2254,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tc0->snd_una == tc0->snd_una_max) { ASSERT (tcp_fin (tcp0)); + tc0->rcv_nxt += 1; tc0->state = TCP_STATE_FIN_WAIT_2; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); @@ -2263,6 +2268,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * acknowledged ("ok") but do not delete the TCB. 
*/ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; + /* check if rtx queue is empty and ack CLOSE TODO */ break; case TCP_STATE_CLOSE_WAIT: @@ -2384,7 +2390,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Got FIN, send ACK! */ tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); @@ -2745,7 +2751,7 @@ tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) if ((tmp = stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, tc->c_lcl_port, tc->c_rmt_port, - tc->c_proto))) + tc->c_transport_proto))) { if (tmp->lcl_port == hdr->dst_port && tmp->rmt_port == hdr->src_port) diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 5e9ecf11..1ecb6ce6 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1087,15 +1087,14 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (is_syn) { tc = tcp_half_open_connection_get (index); + tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID; } else { tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } - /* Make sure timer handle is set to invalid */ - tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; - if (!tcp_in_recovery (tc) && tc->rto_boff > 0 && tc->state >= TCP_STATE_ESTABLISHED) { @@ -1154,6 +1153,20 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Retransmit for SYN/SYNACK */ else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { + /* Half-open connection actually moved to established but we were + * waiting for syn retransmit to pop to call cleanup from the right + * thread. 
*/ + if (tc->flags & TCP_CONN_HALF_OPEN_DONE) + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + if (tcp_half_open_connection_cleanup (tc)) + { + clib_warning ("could not remove half-open connection"); + ASSERT (0); + } + return; + } + /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 5c40ddf9..37640cc6 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -1574,7 +1574,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000103); tc->connection.lcl_port = 35051; tc->connection.rmt_port = 53764; - tc->connection.proto = 0; + tc->connection.transport_proto = 0; clib_memcpy (tc1, &tc->connection, sizeof (*tc1)); pool_get (session_manager_main.sessions[0], s); @@ -1590,7 +1590,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tc->connection.rmt_ip.ip4.as_u32 = clib_host_to_net_u32 (0x06000102); tc->connection.lcl_port = 38225; tc->connection.rmt_port = 53764; - tc->connection.proto = 0; + tc->connection.transport_proto = 0; clib_memcpy (tc2, &tc->connection, sizeof (*tc2)); /* @@ -1601,7 +1601,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, &tc1->rmt_ip.ip4, tc1->lcl_port, tc1->rmt_port, - tc1->proto, 0); + tc1->transport_proto, 0); cmp = (memcmp (&tconn->rmt_ip, &tc1->rmt_ip, sizeof (tc1->rmt_ip)) == 0); TCP_TEST ((cmp), "rmt ip is identical %d", cmp); TCP_TEST ((tconn->lcl_port == tc1->lcl_port), @@ -1614,7 +1614,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, &tc2->rmt_ip.ip4, tc2->lcl_port, tc2->rmt_port, - tc2->proto, 0); + tc2->transport_proto, 0); TCP_TEST ((tconn == 0), "lookup result should be null"); /* @@ -1624,12 +1624,12 @@ 
tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tconn = stream_session_lookup_transport_wt4 (&tc1->lcl_ip.ip4, &tc1->rmt_ip.ip4, tc1->lcl_port, tc1->rmt_port, - tc1->proto, 0); + tc1->transport_proto, 0); TCP_TEST ((tconn == 0), "lookup result should be null"); tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, &tc2->rmt_ip.ip4, tc2->lcl_port, tc2->rmt_port, - tc2->proto, 0); + tc2->transport_proto, 0); TCP_TEST ((tconn == 0), "lookup result should be null"); /* @@ -1639,7 +1639,7 @@ tcp_test_lookup (vlib_main_t * vm, unformat_input_t * input) tconn = stream_session_lookup_transport_wt4 (&tc2->lcl_ip.ip4, &tc2->rmt_ip.ip4, tc2->lcl_port, tc2->rmt_port, - tc2->proto, 0); + tc2->transport_proto, 0); TCP_TEST ((tconn == 0), "lookup result should be null"); return 0; diff --git a/src/vnet/udp/udp.c b/src/vnet/udp/udp.c index ff76a82e..fedf2cc0 100644 --- a/src/vnet/udp/udp.c +++ b/src/vnet/udp/udp.c @@ -34,7 +34,7 @@ udp_session_bind_ip4 (u32 session_index, transport_endpoint_t * lcl) memset (listener, 0, sizeof (udp_connection_t)); listener->c_lcl_port = clib_host_to_net_u16 (lcl->port); listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32; - listener->c_proto = SESSION_TYPE_IP4_UDP; + listener->c_transport_proto = TRANSPORT_PROTO_UDP; udp_register_dst_port (um->vlib_main, lcl->port, udp4_uri_input_node.index, 1 /* is_ipv4 */ ); return 0; @@ -49,7 +49,7 @@ udp_session_bind_ip6 (u32 session_index, transport_endpoint_t * lcl) pool_get (um->udp_listeners, listener); listener->c_lcl_port = clib_host_to_net_u16 (lcl->port); clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6, sizeof (ip6_address_t)); - listener->c_proto = SESSION_TYPE_IP6_UDP; + listener->c_transport_proto = TRANSPORT_PROTO_UDP; udp_register_dst_port (um->vlib_main, lcl->port, udp4_uri_input_node.index, 0 /* is_ipv4 */ ); return 0; @@ -318,8 +318,8 @@ udp_init (vlib_main_t * vm) /* Register as transport with URI */ - session_register_transport (SESSION_TYPE_IP4_UDP, &udp4_proto); - 
session_register_transport (SESSION_TYPE_IP6_UDP, &udp6_proto); + session_register_transport (TRANSPORT_PROTO_UDP, 1, &udp4_proto); + session_register_transport (TRANSPORT_PROTO_UDP, 0, &udp6_proto); /* * Initialize data structures diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 9a8ff076..6ccb1e52 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -179,7 +179,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, us->c_rmt_ip4.as_u32 = ip0->src_address.as_u32; us->c_lcl_port = udp0->dst_port; us->c_rmt_port = udp0->src_port; - us->c_proto = SESSION_TYPE_IP4_UDP; + us->c_transport_proto = TRANSPORT_PROTO_UDP; us->c_c_index = us - um->udp_sessions[my_thread_index]; /* -- cgit 1.2.3-korg From 66b11318a1e5f24880e3ec77c95d70647732a4a8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 31 Jul 2017 17:18:03 -0700 Subject: Fix tcp tx buffer allocation - Make tcp output buffer allocation macro an inline function - Use per ip version per thread tx frames for retransmits and timer events - Fix / parameterize tcp data structure preallocation - Add a couple of gdb-callable show commands - Fix local endpoint cleanup Change-Id: I67b47b7570aa14cb4634b6fd93c57cd2eacbfa29 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/plugins/dpdk/device/cli.c | 1 + src/vlib/error.c | 2 +- src/vlib/node_funcs.h | 16 ++-- src/vnet/ip/ip4.h | 2 - src/vnet/session/session.c | 82 +++++++++++++++-- src/vnet/session/session.h | 10 ++ src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_lookup.c | 40 ++++++-- src/vnet/tcp/builtin_client.c | 39 ++++++-- src/vnet/tcp/tcp.c | 52 +++++++---- src/vnet/tcp/tcp.h | 12 ++- src/vnet/tcp/tcp_input.c | 2 + src/vnet/tcp/tcp_output.c | 188 +++++++++++++++++++++++++------------- src/vnet/unix/gdb_funcs.c | 45 ++++++++- src/vppinfra/pool.h | 2 +- 15 files changed, 375 insertions(+), 120 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/plugins/dpdk/device/cli.c 
b/src/plugins/dpdk/device/cli.c index aeeb772d..fe1c41c2 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -357,6 +357,7 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", rmp->name, (u32) count, (u32) free_count, (u32) (count + free_count)); + rte_mempool_dump (stderr, rmp); } else { diff --git a/src/vlib/error.c b/src/vlib/error.c index e4ed4ee3..dec90bbe 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c @@ -280,7 +280,7 @@ show_errors (vlib_main_t * vm, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cli_show_errors, static) = { +VLIB_CLI_COMMAND (vlib_cli_show_errors) = { .path = "show errors", .short_help = "Show error counts", .function = show_errors, diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index c0389b2f..c4c06454 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -410,19 +410,21 @@ vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index); void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f); -always_inline vlib_process_t * -vlib_get_current_process (vlib_main_t * vm) -{ - vlib_node_main_t *nm = &vm->node_main; - return vec_elt (nm->processes, nm->current_process_index); -} - always_inline uword vlib_in_process_context (vlib_main_t * vm) { return vm->node_main.current_process_index != ~0; } +always_inline vlib_process_t * +vlib_get_current_process (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + if (vlib_in_process_context (vm)) + return vec_elt (nm->processes, nm->current_process_index); + return 0; +} + always_inline uword vlib_current_process (vlib_main_t * vm) { diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index 8f9a8e27..74faa059 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -354,8 +354,6 @@ vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, ih->checksum = 0; b->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4; 
vnet_buffer (b)->l3_hdr_offset = (u8 *) ih - b->data; - vnet_buffer (b)->l4_hdr_offset = vnet_buffer (b)->l3_hdr_offset + - sizeof (*ih); } else ih->checksum = ip4_header_checksum (ih); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 004c7193..4ba15291 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -759,6 +759,7 @@ session_manager_main_enable (vlib_main_t * vm) session_manager_main_t *smm = &session_manager_main; vlib_thread_main_t *vtm = vlib_get_thread_main (); u32 num_threads; + u32 preallocated_sessions_per_worker; int i; num_threads = 1 /* main thread */ + vtm->n_threads; @@ -795,15 +796,35 @@ session_manager_main_enable (vlib_main_t * vm) for (i = 0; i < vec_len (smm->vpp_event_queues); i++) session_vpp_event_queue_allocate (smm, i); - /* $$$$ preallocate hack config parameter */ - for (i = 0; i < smm->preallocated_sessions; i++) + /* Preallocate sessions */ + if (num_threads == 1) { - stream_session_t *ss __attribute__ ((unused)); - pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + for (i = 0; i < smm->preallocated_sessions; i++) + { + stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + } + + for (i = 0; i < smm->preallocated_sessions; i++) + pool_put_index (smm->sessions[0], i); } + else + { + int j; + preallocated_sessions_per_worker = smm->preallocated_sessions / + (num_threads - 1); - for (i = 0; i < smm->preallocated_sessions; i++) - pool_put_index (smm->sessions[0], i); + for (j = 1; j < num_threads; j++) + { + for (i = 0; i < preallocated_sessions_per_worker; i++) + { + stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[j], ss, CLIB_CACHE_LINE_BYTES); + } + for (i = 0; i < preallocated_sessions_per_worker; i++) + pool_put_index (smm->sessions[j], i); + } + } session_lookup_init (); @@ -863,6 +884,7 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t 
*smm = &session_manager_main; u32 nitems; + uword tmp; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -873,9 +895,53 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) else clib_warning ("event queue length %d too small, ignored", nitems); } - if (unformat (input, "preallocated-sessions %d", - &smm->preallocated_sessions)) + else if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; + else if (unformat (input, "v4-session-table-buckets %d", + &smm->configured_v4_session_table_buckets)) ; + else if (unformat (input, "v4-halfopen-table-buckets %d", + &smm->configured_v4_halfopen_table_buckets)) + ; + else if (unformat (input, "v6-session-table-buckets %d", + &smm->configured_v6_session_table_buckets)) + ; + else if (unformat (input, "v6-halfopen-table-buckets %d", + &smm->configured_v6_halfopen_table_buckets)) + ; + else if (unformat (input, "v4-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_session_table_memory = tmp; + } + else if (unformat (input, "v4-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_halfopen_table_memory = tmp; + } + else if (unformat (input, "v6-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_session_table_memory = tmp; + } + else if (unformat (input, "v6-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_halfopen_table_memory = tmp; + } else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff 
--git a/src/vnet/session/session.h b/src/vnet/session/session.h index 180b9f8a..538433da 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -133,6 +133,16 @@ struct _session_manager_main /** vpp fifo event queue configured length */ u32 configured_event_queue_length; + /** session table size parameters */ + u32 configured_v4_session_table_buckets; + u32 configured_v4_session_table_memory; + u32 configured_v4_halfopen_table_buckets; + u32 configured_v4_halfopen_table_memory; + u32 configured_v6_session_table_buckets; + u32 configured_v6_session_table_memory; + u32 configured_v6_halfopen_table_buckets; + u32 configured_v6_halfopen_table_memory; + /** Unique segment name counter */ u32 unique_segment_name_counter; diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index de564ea7..9f3d217c 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -312,7 +312,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_session_command, static) = +VLIB_CLI_COMMAND (vlib_cli_show_session_command) = { .path = "show session", .short_help = "show session [verbose]", diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index 1ce22f80..41f9dbf0 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -569,23 +569,45 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, return 0; } +#define foreach_hash_table_parameter \ + _(v4,session,buckets,20000) \ + _(v4,session,memory,(64<<20)) \ + _(v6,session,buckets,20000) \ + _(v6,session,memory,(64<<20)) \ + _(v4,halfopen,buckets,20000) \ + _(v4,halfopen,memory,(64<<20)) \ + _(v6,halfopen,buckets,20000) \ + _(v6,halfopen,memory,(64<<20)) + void session_lookup_init (void) { session_lookup_t *sl = &session_lookup; + +#define _(af,table,parm,value) \ + u32 configured_##af##_##table##_table_##parm = value; + 
foreach_hash_table_parameter; +#undef _ + +#define _(af,table,parm,value) \ + if (session_manager_main.configured_##af##_##table##_table_##parm) \ + configured_##af##_##table##_table_##parm = \ + session_manager_main.configured_##af##_##table##_table_##parm; + foreach_hash_table_parameter; +#undef _ + clib_bihash_init_16_8 (&sl->v4_session_hash, "v4 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_session_table_buckets, + configured_v4_session_table_memory); clib_bihash_init_48_8 (&sl->v6_session_hash, "v6 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - + configured_v6_session_table_buckets, + configured_v6_session_table_memory); clib_bihash_init_16_8 (&sl->v4_half_open_hash, "v4 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_halfopen_table_buckets, + configured_v4_halfopen_table_memory); clib_bihash_init_48_8 (&sl->v6_half_open_hash, "v6 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v6_halfopen_table_buckets, + configured_v6_halfopen_table_memory); } /* diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 27e20f8e..48daffb4 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -597,8 +597,9 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) a->mp = 0; vnet_connect_uri (a); - /* Crude pacing for call setups, 100k/sec */ - vlib_process_suspend (vm, 10e-6); + /* Crude pacing for call setups */ + if ((i % 4) == 0) + vlib_process_suspend (vm, 10e-6); } } @@ -612,8 +613,10 @@ test_tcp_clients_command_fn (vlib_main_t * vm, uword *event_data = 0, event_type; u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri; u64 tmp, total_bytes; - f64 cli_timeout = 20.0, delta; + f64 
test_timeout = 20.0, syn_timeout = 20.0, delta; + f64 time_before_connects; u32 n_clients = 1; + int preallocate_sessions = 0; char *transfer_type; int i; @@ -640,7 +643,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "uri %s", &tm->connect_uri)) ; - else if (unformat (input, "cli-timeout %f", &cli_timeout)) + else if (unformat (input, "test-timeout %f", &test_timeout)) + ; + else if (unformat (input, "syn-timeout %f", &syn_timeout)) ; else if (unformat (input, "no-return")) tm->no_return = 1; @@ -657,6 +662,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->private_segment_size = tmp; else if (unformat (input, "preallocate-fifos")) tm->prealloc_fifos = 1; + else if (unformat (input, "preallocate-sessions")) + preallocate_sessions = 1; else if (unformat (input, "client-batch %d", &tm->connections_per_batch)) ; @@ -674,6 +681,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, return clib_error_return (0, "failed init"); } + tm->ready_connections = 0; tm->expected_connections = n_clients; tm->rx_total = 0; @@ -705,11 +713,21 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_node_set_state (vlib_mains[i], builtin_client_node.index, VLIB_NODE_STATE_POLLING); + if (preallocate_sessions) + { + session_t *sp __attribute__ ((unused)); + for (i = 0; i < n_clients; i++) + pool_get (tm->sessions, sp); + for (i = 0; i < n_clients; i++) + pool_put_index (tm->sessions, i); + } + /* Fire off connect requests */ + time_before_connects = vlib_time_now (vm); clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... 
*/ - vlib_process_wait_for_event_or_clock (vm, 10 /* timeout, seconds */ ); + vlib_process_wait_for_event_or_clock (vm, syn_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { @@ -719,6 +737,15 @@ test_tcp_clients_command_fn (vlib_main_t * vm, goto cleanup; case 1: + delta = vlib_time_now (vm) - time_before_connects; + + if (delta != 0.0) + { + vlib_cli_output + (vm, "%d three-way handshakes in %.2f seconds, %.2f/sec", + n_clients, delta, ((f64) n_clients) / delta); + } + tm->test_start_time = vlib_time_now (tm->vlib_main); vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time); break; @@ -729,7 +756,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, } /* Now wait for the sessions to finish... */ - vlib_process_wait_for_event_or_clock (vm, cli_timeout); + vlib_process_wait_for_event_or_clock (vm, test_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 59b20747..8e2eb9f4 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -173,7 +173,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Cleanup local endpoint if this was an active connect */ tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, - tc->c_lcl_port); + clib_net_to_host_u16 (tc->c_lcl_port)); if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) { tep = pool_elt_at_index (tm->local_endpoints, tepi); @@ -367,25 +367,24 @@ tcp_allocate_local_port (ip46_address_t * ip) { tcp_main_t *tm = vnet_get_tcp_main (); transport_endpoint_t *tep; - u32 time_now, tei; + u32 tei; u16 min = 1024, max = 65535; /* XXX configurable ? 
*/ - int tries; + int tries, limit; - tries = max - min; - time_now = tcp_time_now (); + limit = max - min; /* Only support active opens from thread 0 */ ASSERT (vlib_get_thread_index () == 0); /* Search for first free slot */ - for (; tries >= 0; tries--) + for (tries = 0; tries < limit; tries++) { u16 port = 0; /* Find a port in the specified range */ while (1) { - port = random_u32 (&time_now) & PORT_MASK; + port = random_u32 (&tm->port_allocator_seed) & PORT_MASK; if (PREDICT_TRUE (port >= min && port < max)) break; } @@ -1189,8 +1188,9 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; - int thread, i; + int i, thread; tcp_connection_t *tc __attribute__ ((unused)); + u32 preallocated_connections_per_thread; if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1224,14 +1224,26 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->connections, num_threads - 1); /* - * Preallocate connections + * Preallocate connections. 
Assume that thread 0 won't + * use preallocated connections when running multi-core */ - for (thread = 0; thread < num_threads; thread++) + if (num_threads == 1) { - for (i = 0; i < tm->preallocated_connections; i++) + thread = 0; + preallocated_connections_per_thread = tm->preallocated_connections; + } + else + { + thread = 1; + preallocated_connections_per_thread = + tm->preallocated_connections / (num_threads - 1); + } + for (; thread < num_threads; thread++) + { + for (i = 0; i < preallocated_connections_per_thread; i++) pool_get (tm->connections[thread], tc); - for (i = 0; i < tm->preallocated_connections; i++) + for (i = 0; i < preallocated_connections_per_thread; i++) pool_put_index (tm->connections[thread], i); } @@ -1257,13 +1269,21 @@ tcp_main_enable (vlib_main_t * vm) / TCP_TSTAMP_RESOLUTION; clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + 1000000 /* $$$$ config parameter nbuckets */ , + (512 << 20) /*$$$ config parameter table size */ ); + + /* Initialize [port-allocator] random number seed */ + tm->port_allocator_seed = (u32) clib_cpu_time_now (); + if (num_threads > 1) { clib_spinlock_init (&tm->half_open_lock); clib_spinlock_init (&tm->local_endpoints_lock); } + + vec_validate (tm->tx_frames[0], num_threads - 1); + vec_validate (tm->tx_frames[1], num_threads - 1); + return error; } @@ -1289,16 +1309,12 @@ clib_error_t * tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - - tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; - return 0; } VLIB_INIT_FUNCTION (tcp_init); - static clib_error_t * tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 4fa681f8..997df76f 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -369,6 +369,8 @@ typedef struct _tcp_main /** per-worker tx buffer free lists */ u32 **tx_buffers; + /** per-worker tx 
frames to 4/6 output nodes */ + vlib_frame_t **tx_frames[2]; /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; @@ -400,11 +402,8 @@ typedef struct _tcp_main u32 last_v6_address_rotor; ip6_address_t *ip6_src_addresses; - /* convenience */ - vlib_main_t *vlib_main; - vnet_main_t *vnet_main; - ip4_main_t *ip4_main; - ip6_main_t *ip6_main; + /** Port allocator random number generator seed */ + u32 port_allocator_seed; } tcp_main_t; extern tcp_main_t tcp_main; @@ -493,6 +492,8 @@ void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); void tcp_update_rto (tcp_connection_t * tc); +void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4); +void tcp_flush_frames_to_output (u8 thread_index); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -614,6 +615,7 @@ tcp_update_time (f64 now, u32 thread_index) { tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], now); + tcp_flush_frames_to_output (thread_index); } u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6c59d70f..29f4f08d 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1751,6 +1751,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); + tcp_flush_frame_to_output (vm, my_thread_index, is_ip4); + return from_frame->n_vectors; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ad13493a..f8fbb8a9 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -436,34 +436,41 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } -#define tcp_get_free_buffer_index(tm, bidx) \ -do { \ - u32 
*my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ - { \ - n_free_buffers = 32; /* TODO config or macro */ \ - vec_validate (my_tx_buffers, n_free_buffers - 1); \ - _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - vlib_get_main(), my_tx_buffers, n_free_buffers, \ - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ - } \ - /* buffer shortage */ \ - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ - return; \ - *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \ - _vec_len (my_tx_buffers) -= 1; \ -} while (0) - -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ -} while (0) +always_inline int +tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) +{ + u32 *my_tx_buffers, n_free_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + { + n_free_buffers = VLIB_FRAME_SIZE; + vec_validate (my_tx_buffers, n_free_buffers - 1); + _vec_len (my_tx_buffers) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (my_tx_buffers) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + tm->tx_buffers[thread_index] = my_tx_buffers; + } + *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; + _vec_len (my_tx_buffers) -= 1; + return 0; +} + +always_inline void +tcp_return_buffer (tcp_main_t * tm) +{ + u32 *my_tx_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + _vec_len (my_tx_buffers) += 1; +} always_inline void tcp_reuse_buffer (vlib_main_t * vm, 
vlib_buffer_t * b) @@ -706,7 +713,9 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) ip4_header_t *ih4, *pkt_ih4; ip6_header_t *ih6, *pkt_ih6; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -811,7 +820,9 @@ tcp_send_syn (tcp_connection_t * tc) u16 initial_wnd; tcp_options_t snd_opts; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -854,8 +865,11 @@ tcp_send_syn (tcp_connection_t * tc) } always_inline void -tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) { + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); u32 *to_next, next_index; vlib_frame_t *f; @@ -872,12 +886,62 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) b->pre_data[1] = next_index; } - /* Enqueue the packet */ - f = vlib_get_frame_to_node (vm, next_index); + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } to_next = vlib_frame_vector_args (f); - to_next[0] = bi; - f->n_vectors = 1; - vlib_put_frame_to_node (vm, next_index, f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 
is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + +/** + * Flush tx frame populated by retransmits and timer pops + */ +void +tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.tx_frames[!is_ip4][thread_index]); + tcp_main.tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush both v4 and v6 tx frames for thread index + */ +void +tcp_flush_frames_to_output (u8 thread_index) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_flush_frame_to_output (vm, thread_index, 1); + tcp_flush_frame_to_output (vm, thread_index, 0); } /** @@ -891,14 +955,15 @@ tcp_send_fin (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_make_fin (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); @@ -981,7 +1046,8 @@ tcp_send_ack (tcp_connection_t * tc) u32 bi; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Fill in the ACK */ @@ -1108,7 +1174,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) @@ -1116,6 +1184,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) 
/* Lost FIN, retransmit and return */ if (tc->flags & TCP_CONN_FINSNT) { + tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1143,6 +1212,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); + tcp_return_buffer (tm); return; } @@ -1164,6 +1234,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } + tcp_return_buffer (tm); return; } @@ -1185,6 +1256,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); + tcp_return_buffer (tm); return; } @@ -1254,7 +1326,9 @@ tcp_timer_persist_handler (u32 index) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); /* Try to force the first unsent segment */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); @@ -1300,7 +1374,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tc->snd_nxt = tc->snd_una; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); @@ -1344,9 +1420,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (vm, bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1414,9 +1491,9 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + 
return; b = vlib_get_buffer (vm, bi); - offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); @@ -1506,32 +1583,21 @@ tcp46_output_inline (vlib_main_t * vm, if (is_ip4) { - ip4_header_t *ih0; - ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, - &tc0->c_rmt_ip4, IP_PROTOCOL_TCP, - 1); - b0->flags |= - VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4, + IP_PROTOCOL_TCP, 1); + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; } else { ip6_header_t *ih0; - int bogus = ~0; - ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); - - b0->flags |= VNET_BUFFER_F_IS_IP6 | - VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; - ASSERT (!bogus); } /* Filter out DUPACKs if there are no OOO segments left */ diff --git a/src/vnet/unix/gdb_funcs.c b/src/vnet/unix/gdb_funcs.c index cca2e420..32e22d92 100644 --- a/src/vnet/unix/gdb_funcs.c +++ b/src/vnet/unix/gdb_funcs.c @@ -21,7 +21,7 @@ #include #include - +#include /** * @brief GDB callable function: vl - Return vector length of vector @@ -135,6 +135,47 @@ void vlib_runtime_index_to_node_name (u32 index) fformat(stderr, "node runtime index %d name %s\n", index, nm->nodes[index]->name); } +void gdb_show_errors (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_errors; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else + { + fformat(stderr, "verbose not 0 or 
1\n"); + return; + } + + vlib_cli_show_errors.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} + +void gdb_show_session (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_session_command; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else if (verbose == 2) + unformat_init_string (&input, "verbose 2", 9); + else + { + fformat(stderr, "verbose not 0 - 2\n"); + return; + } + + vlib_cli_show_session_command.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} /** * @brief GDB callable function: show_gdb_command_fn - show gdb @@ -151,6 +192,8 @@ show_gdb_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "vl(p) returns vec_len(p)"); vlib_cli_output (vm, "pe(p) returns pool_elts(p)"); vlib_cli_output (vm, "pifi(p, i) returns pool_is_free_index(p, i)"); + vlib_cli_output (vm, "gdb_show_errors(0|1) dumps error counters"); + vlib_cli_output (vm, "gdb_show_session dumps session counters"); vlib_cli_output (vm, "debug_hex_bytes (ptr, n_bytes) dumps n_bytes in hex"); vlib_cli_output (vm, "vlib_dump_frame_ownership() does what it says"); vlib_cli_output (vm, "vlib_runtime_index_to_node_name (index) prints NN"); diff --git a/src/vppinfra/pool.h b/src/vppinfra/pool.h index 57838e1c..56536b77 100644 --- a/src/vppinfra/pool.h +++ b/src/vppinfra/pool.h @@ -200,7 +200,7 @@ do { \ #define pool_get(P,E) pool_get_aligned(P,E,0) /** See if pool_get will expand the pool or not */ -#define pool_get_aligned_will_expand (P,YESNO,A) \ +#define pool_get_aligned_will_expand(P,YESNO,A) \ do { \ pool_header_t * _pool_var (p) = pool_header (P); \ uword _pool_var (l); \ -- cgit 1.2.3-korg From b2215d6b0d8ef7d425d2b9eea524a1c055a9f3b3 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 1 Aug 2017 16:56:58 -0700 Subject: Fix tcp multi buffer segments retransmission - Fix tcp/udp sw checksum 
computation - Fix allocation of multi buffer tcp segments for retransmits - Send FIN only if/when tx fifo is empty Change-Id: I2e43a14b87a72c9e547b4339b9a51811cf5732c4 Signed-off-by: Florin Coras --- src/vlib/buffer_funcs.h | 7 +- src/vnet/ip/ip4_forward.c | 12 +- src/vnet/session/session.c | 6 +- src/vnet/session/session_node.c | 39 ++++-- src/vnet/tcp/tcp.c | 39 ++++-- src/vnet/tcp/tcp.h | 18 ++- src/vnet/tcp/tcp_input.c | 48 +++++-- src/vnet/tcp/tcp_output.c | 290 +++++++++++++++++++++++++--------------- 8 files changed, 304 insertions(+), 155 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 72008dad..6a662416 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -833,7 +833,12 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_length); _(flags); #undef _ - ASSERT (dst->total_length_not_including_first_buffer == 0); + /* ASSERT (dst->total_length_not_including_first_buffer == 0); */ + /* total_length_not_including_first_buffer is not in the template anymore + * so it may actually not be zeroed for some buffers. 
One option is to + * uncomment the line lower (comes at a cost), the other, is to just not + * care */ + /* dst->total_length_not_including_first_buffer = 0; */ ASSERT (dst->n_add_refs == 0); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 7a8d7a0c..496df3c7 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1454,7 +1454,7 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, { ip_csum_t sum0; u32 ip_header_length, payload_length_host_byte_order; - u32 n_this_buffer, n_bytes_left; + u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer; u16 sum16; void *data_this_buffer; @@ -1481,10 +1481,12 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, n_bytes_left = n_this_buffer = payload_length_host_byte_order; data_this_buffer = (void *) ip0 + ip_header_length; - if (n_this_buffer + ip_header_length > p0->current_length) - n_this_buffer = - p0->current_length > - ip_header_length ? p0->current_length - ip_header_length : 0; + n_ip_bytes_this_buffer = p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data); + if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer) + { + n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ? 
+ n_ip_bytes_this_buffer - ip_header_length : 0; + } while (1) { sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 533a6c22..3a3e4dfe 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -98,9 +98,9 @@ session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, u32 offset, u8 is_in_order) { vlib_buffer_t *chain_b; - u32 chain_bi = b->next_buffer; + u32 chain_bi = b->next_buffer, len; vlib_main_t *vm = vlib_get_main (); - u8 *data, len; + u8 *data; u16 written = 0; int rv = 0; @@ -226,7 +226,7 @@ u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); - if (s->session_state != SESSION_STATE_READY) + if (!s->server_tx_fifo) return 0; return svm_fifo_max_dequeue (s->server_tx_fifo); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 8d703b0b..9c5b17d9 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -75,20 +75,25 @@ always_inline void session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, u8 thread_index, svm_fifo_t * fifo, vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, - u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, - u16 deq_per_buf, u8 peek_data) + u32 left_from_seg, u32 * left_to_snd0, + u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf, + u8 peek_data) { vlib_buffer_t *chain_b0, *prev_b0; - u32 chain_bi0; + u32 chain_bi0, to_deq; u16 len_to_deq0, n_bytes_read; u8 *data0, j; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b0->total_length_not_including_first_buffer = 0; + chain_bi0 = bi0; chain_b0 = b0; + to_deq = left_from_seg; for (j = 1; j < n_bufs_per_seg; j++) { prev_b0 = chain_b0; - len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); + len_to_deq0 = clib_min (to_deq, deq_per_buf); *n_bufs -= 1; chain_bi0 = smm->tx_buffers[thread_index][*n_bufs]; 
@@ -117,10 +122,12 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, /* update current buffer */ chain_b0->next_buffer = 0; - *left_to_snd0 -= n_bytes_read; - if (*left_to_snd0 == 0) + to_deq -= n_bytes_read; + if (to_deq == 0) break; } + ASSERT (to_deq == 0); + *left_to_snd0 -= left_from_seg; } always_inline int @@ -223,7 +230,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); n_bufs += buffers_allocated; - _vec_len (smm->tx_buffers[thread_index]) = n_bufs; if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) @@ -289,11 +295,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * Fill in the remaining buffers in the chain, if any */ if (PREDICT_FALSE (n_bufs_per_seg > 1)) - session_tx_fifo_chain_tail (smm, vm, thread_index, - s0->server_tx_fifo, b0, bi0, - n_bufs_per_seg, &left_to_snd0, - &n_bufs, &rx_offset, deq_per_buf, - peek_data); + { + u32 left_for_seg; + left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0); + session_tx_fifo_chain_tail (smm, vm, thread_index, + s0->server_tx_fifo, b0, bi0, + n_bufs_per_seg, left_for_seg, + &left_to_snd0, &n_bufs, &rx_offset, + deq_per_buf, peek_data); + } /* Ask transport to push header after current_length and * total_length_not_including_first_buffer are updated */ @@ -607,8 +617,9 @@ skip_dequeue: clib_warning ("It's dead, Jim!"); continue; } - - if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + /* Can retransmit for closed sessions but can't do anything if + * session is not ready or closed */ + if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY)) continue; /* Spray packets in per session type frames, since they go to * different nodes */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 8e2eb9f4..4652618b 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -288,18 +288,31 @@ tcp_connection_close (tcp_connection_t * tc) { TCP_EVT_DBG 
(TCP_EVT_CLOSE, tc); - /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED - || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) - tcp_send_fin (tc); - - /* Switch state */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD) - tc->state = TCP_STATE_FIN_WAIT_1; - else if (tc->state == TCP_STATE_SYN_SENT) - tc->state = TCP_STATE_CLOSED; - else if (tc->state == TCP_STATE_CLOSE_WAIT) - tc->state = TCP_STATE_LAST_ACK; + /* Send/Program FIN if needed and switch state */ + switch (tc->state) + { + case TCP_STATE_SYN_SENT: + tc->state = TCP_STATE_CLOSED; + break; + case TCP_STATE_SYN_RCVD: + tcp_send_fin (tc); + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_ESTABLISHED: + if (!stream_session_tx_fifo_max_dequeue (&tc->connection)) + tcp_send_fin (tc); + else + tc->flags |= TCP_CONN_FINPNDG; + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_CLOSE_WAIT: + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + break; + default: + clib_warning ("shouldn't be here"); + } + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ @@ -1284,6 +1297,8 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->tx_frames[0], num_threads - 1); vec_validate (tm->tx_frames[1], num_threads - 1); + tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 997df76f..a17262fa 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -116,7 +116,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(RECOVERY, "Recovery on") \ _(FAST_RECOVERY, "Fast Recovery on") \ _(FR_1_SMSS, "Sent 1 SMSS") \ - _(HALF_OPEN_DONE, "Half-open completed") + _(HALF_OPEN_DONE, "Half-open completed") \ + _(FINPNDG, "FIN pending") typedef enum _tcp_connection_flag_bits { @@ -404,6 +405,9 @@ typedef struct _tcp_main /** Port allocator random 
number generator seed */ u32 port_allocator_seed; + + /** vlib buffer size */ + u32 bytes_per_buffer; } tcp_main_t; extern tcp_main_t tcp_main; @@ -587,6 +591,14 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +always_inline u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); @@ -621,8 +633,8 @@ tcp_update_time (f64 now, u32 thread_index) u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes); +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_bytes, vlib_buffer_t ** b); void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 29f4f08d..a3b48d83 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -492,14 +492,6 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } -static u8 -tcp_is_lost_fin (tcp_connection_t * tc) -{ - if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) - return 1; - return 0; -} - /** * Checks if ack is a congestion control event. 
*/ @@ -1162,7 +1154,8 @@ partial_ack: /* Remove retransmitted bytes that have been delivered */ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv - >= tc->sack_sb.last_bytes_delivered); + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) { @@ -1273,6 +1266,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) { tcp_cc_handle_event (tc, is_dack); + if (!tcp_in_cong_recovery (tc)) + return 0; *error = TCP_ERROR_ACK_DUP; TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); return vnet_buffer (b)->tcp.data_len ? 0 : -1; @@ -1497,6 +1492,29 @@ tcp_can_delack (tcp_connection_t * tc) return 1; } +static int +tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) +{ + u32 discard; + vlib_main_t *vm = vlib_get_main (); + + /* Handle multi segment packets */ + if (n_bytes_to_drop > b->current_length) + { + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + return -1; + do + { + discard = clib_min (n_bytes_to_drop, b->current_length); + vlib_buffer_advance (b, discard); + b = vlib_get_buffer (vm, b->next_buffer); + n_bytes_to_drop -= discard; + } + while (n_bytes_to_drop); + } + return 0; +} + static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0) @@ -1530,7 +1548,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; - vlib_buffer_advance (b, n_bytes_to_drop); + if (tcp_buffer_discard_bytes (b, n_bytes_to_drop)) + goto done; goto in_order; } @@ -2252,8 +2271,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; + /* Still have to send the FIN */ + if (tc0->flags & TCP_CONN_FINPNDG) + { + /* TX fifo finally drained */ + if 
(!stream_session_tx_fifo_max_dequeue (&tc0->connection)) + tcp_send_fin (tc0); + } /* If FIN is ACKed */ - if (tc0->snd_una == tc0->snd_una_max) + else if (tc0->snd_una == tc0->snd_una_max) { ASSERT (tcp_fin (tcp0)); tc0->rcv_nxt += 1; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index f8fbb8a9..4c1add21 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -15,6 +15,7 @@ #include #include +#include vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; @@ -84,7 +85,7 @@ void tcp_update_rcv_mss (tcp_connection_t * tc) { /* TODO find our iface MTU */ - tc->mss = dummy_mtu; + tc->mss = dummy_mtu - sizeof (tcp_header_t); } /** @@ -436,28 +437,35 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } +always_inline int +tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) +{ + vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1); + _vec_len (tm->tx_buffers[thread_index]) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), + tm->tx_buffers[thread_index], + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (tm->tx_buffers[thread_index]) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + return 0; +} + always_inline int tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) { - u32 *my_tx_buffers, n_free_buffers; + u32 *my_tx_buffers; u32 thread_index = vlib_get_thread_index (); - my_tx_buffers = tm->tx_buffers[thread_index]; - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0)) { - n_free_buffers = VLIB_FRAME_SIZE; - vec_validate (my_tx_buffers, n_free_buffers - 1); - _vec_len (my_tx_buffers) = - vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, - n_free_buffers, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - /* buffer shortage, report failure */ - if (vec_len (my_tx_buffers) == 0) - { 
- clib_warning ("out of buffers"); - return -1; - } - tm->tx_buffers[thread_index] = my_tx_buffers; + if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE)) + return -1; } + my_tx_buffers = tm->tx_buffers[thread_index]; *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; _vec_len (my_tx_buffers) -= 1; return 0; @@ -476,6 +484,7 @@ always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { vlib_buffer_t *it = b; + u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; do { it->current_data = 0; @@ -485,6 +494,10 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) && (it = vlib_get_buffer (vm, it->next_buffer))); + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, b->next_buffer); + b->flags = save_free_list; + /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); vnet_buffer (b)->tcp.flags = 0; @@ -959,18 +972,16 @@ tcp_send_fin (tcp_connection_t * tc) return; b = vlib_get_buffer (vm, bi); - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tc->flags &= ~TCP_CONN_FINPNDG; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 -tcp_make_state_flags (tcp_state_t next_state) +tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) { switch (next_state) { @@ -982,7 +993,10 @@ tcp_make_state_flags (tcp_state_t next_state) return TCP_FLAG_SYN; case TCP_STATE_LAST_ACK: case TCP_STATE_FIN_WAIT_1: - return TCP_FLAG_FIN; + if (tc->snd_nxt + 1 < tc->snd_una_max) + return TCP_FLAG_ACK; + else + return TCP_FLAG_FIN; default: clib_warning ("Shouldn't be here!"); } @@ -1008,7 +1022,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tcp_window_to_advertise (tc, 
next_state); - flags = tcp_make_state_flags (next_state); + flags = tcp_make_state_flags (tc, next_state); /* Push header and options */ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, @@ -1055,7 +1069,11 @@ tcp_send_ack (tcp_connection_t * tc) tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } -/* Send delayed ACK when timer expires */ +/** + * Delayed ack timer handler + * + * Sends delayed ACK when timer expires + */ void tcp_timer_delack_handler (u32 index) { @@ -1067,49 +1085,138 @@ tcp_timer_delack_handler (u32 index) tcp_send_ack (tc); } -/** Build a retransmit segment +/** + * Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit */ u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes) +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) { + tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); int n_bytes = 0; - u32 start; - - tcp_reuse_buffer (vm, b); + u32 start, bi, available_bytes; ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_bytes != 0); + ASSERT (max_deq_bytes != 0); - max_bytes = clib_min (tc->snd_mss, max_bytes); + /* + * Make sure we can retransmit something + */ + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + if (!available_bytes) + return 0; + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); start = tc->snd_una + offset; /* Start is beyond snd_congestion */ if (seq_geq (start, tc->snd_congestion)) - goto done; + { + goto done; + } /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - start; - if (max_bytes == 0) - goto done; + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) 
+ { + goto done; + } } + /* + * Prepare options + */ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - ASSERT (max_bytes <= tc->snd_mss); + /* + * Allocate and fill in buffer(s) + */ + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return 0; + *b = vlib_get_buffer (vm, bi); + + /* Easy case, buffer size greater than mss */ + if (PREDICT_TRUE (max_deq_bytes <= tm->bytes_per_buffer)) + { + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, max_deq_bytes); + ASSERT (n_bytes == max_deq_bytes); + b[0]->current_length = n_bytes; + tcp_push_hdr_i (tc, *b, tc->state, 0); + } + /* Split mss into multiple buffers */ + else + { + u32 chain_bi = ~0, n_bufs_per_seg; + u32 thread_index = vlib_get_thread_index (); + u16 n_peeked, len_to_deq, available_bufs; + vlib_buffer_t *chain_b, *prev_b; + u8 *data0; + int i; + + n_bufs_per_seg = ceil ((double) max_deq_bytes / tm->bytes_per_buffer); + ASSERT (available_bytes >= max_deq_bytes); + + /* Make sure we have enough buffers */ + available_bufs = vec_len (tm->tx_buffers[thread_index]); + if (n_bufs_per_seg > available_bufs) + { + if (tcp_alloc_tx_buffers (tm, thread_index, + VLIB_FRAME_SIZE - available_bufs)) + { + tcp_return_buffer (tm); + return 0; + } + } + + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, tm->bytes_per_buffer); + b[0]->current_length = n_bytes; + b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b[0]->total_length_not_including_first_buffer = 0; + + tcp_push_hdr_i (tc, *b, tc->state, 0); + max_deq_bytes -= n_bytes; + + chain_b = *b; + for (i = 1; i < n_bufs_per_seg; i++) + { + prev_b = chain_b; + len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer); + tcp_get_free_buffer_index (tm, &chain_bi); + ASSERT (chain_bi != (u32) ~ 0); + chain_b = vlib_get_buffer (vm, chain_bi); + chain_b->current_data = 0; + data0 = vlib_buffer_get_current (chain_b); + n_peeked = stream_session_peek_bytes 
(&tc->connection, data0, + n_bytes, len_to_deq); + n_bytes += n_peeked; + ASSERT (n_peeked == len_to_deq); + chain_b->current_length = n_peeked; + b[0]->total_length_not_including_first_buffer += + chain_b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = chain_bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + chain_b->next_buffer = 0; + + max_deq_bytes -= n_peeked; + } + } - n_bytes = stream_session_peek_bytes (&tc->connection, - vlib_buffer_get_current (b), offset, - max_bytes); ASSERT (n_bytes > 0); - b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state, 0); if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1147,7 +1254,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_main_t *vm = vlib_get_main (); u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; u32 bi, n_bytes; if (is_syn) @@ -1174,17 +1281,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ - if (tc->flags & TCP_CONN_FINSNT) + if (tcp_is_lost_fin (tc)) { - tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1199,7 +1300,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); /* Send one segment */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + ASSERT (n_bytes); + bi = vlib_get_buffer_index (vm, b); /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); @@ -1212,7 +1315,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); - tcp_return_buffer 
(tm); return; } @@ -1234,7 +1336,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } - tcp_return_buffer (tm); return; } @@ -1243,6 +1344,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_push_hdr_i (tc, b, tc->state, 1); @@ -1256,7 +1360,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); - tcp_return_buffer (tm); return; } @@ -1305,7 +1408,7 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt; + u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0; int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1317,34 +1420,31 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED - || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) + || !available_bytes) return; /* Increment RTO backoff */ tc->rto_boff += 1; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Try to force the first unsent segment */ + /* + * Try to force the first unsent segment (or buffer) + */ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; - b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer); n_bytes = 
stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, - tc->snd_mss); - /* Nothing to send */ - if (n_bytes <= 0) - { - // clib_warning ("persist found nothing to send"); - tcp_return_buffer (tm); - return; - } - + snd_bytes); + ASSERT (n_bytes != 0); b->current_length = n_bytes; ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); @@ -1365,32 +1465,20 @@ tcp_timer_persist_handler (u32 index) void tcp_retransmit_first_unacked (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes, old_snd_nxt; + u32 bi, old_snd_nxt, n_bytes; old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - /* Get buffer */ - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); - if (n_bytes == 0) - { - tcp_return_buffer (tm); - goto done; - } - + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + if (!n_bytes) + return; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); -done: tc->snd_nxt = old_snd_nxt; } @@ -1400,10 +1488,9 @@ done: void tcp_fast_retransmit_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, max_bytes; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; @@ -1420,10 +1507,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1443,7 
+1526,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) offset = tc->snd_congestion - tc->snd_una - max_bytes; sb->rescue_rxt = tc->snd_congestion; tc->snd_nxt = tc->snd_una + offset; - tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, + &b); + ASSERT (n_written); + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); break; } @@ -1451,15 +1537,13 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = tc->snd_una + offset; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); sb->high_rxt += n_written; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; @@ -1475,7 +1559,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) void tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; @@ -1491,19 +1574,14 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - b = vlib_get_buffer (vm, bi); offset += n_written; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); + n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; } -- cgit 1.2.3-korg From 82d3ec8b513455fb1a4a9fd045065beddb87bad7 Mon Sep 17 00:00:00 2001 From: 
Florin Coras Date: Mon, 14 Aug 2017 08:10:42 -0700 Subject: TCP: Update time_now once per burst Change-Id: I58089d7a9867ede9d8a36b2aea62edef04cb5b81 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp.c | 2 ++ src/vnet/tcp/tcp.h | 12 +++++++++++- src/vnet/tcp/tcp_input.c | 2 +- src/vnet/tcp/tcp_output.c | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 4652618b..10ecf2f3 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1299,6 +1299,8 @@ tcp_main_enable (vlib_main_t * vm) tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + vec_validate (tm->time_now, num_threads - 1); return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index a17262fa..52610ddd 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -367,6 +367,7 @@ typedef struct _tcp_main u8 log2_tstamp_clocks_per_tick; f64 tstamp_ticks_per_clock; + u32 *time_now; /** per-worker tx buffer free lists */ u32 **tx_buffers; @@ -619,12 +620,21 @@ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); always_inline u32 tcp_time_now (void) { - return clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; + return tcp_main.time_now[vlib_get_thread_index ()]; +} + +always_inline u32 +tcp_set_time_now (u32 thread_index) +{ + tcp_main.time_now[thread_index] = clib_cpu_time_now () + * tcp_main.tstamp_ticks_per_clock; + return tcp_main.time_now[thread_index]; } always_inline void tcp_update_time (f64 now, u32 thread_index) { + tcp_set_time_now (thread_index); tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], now); tcp_flush_frames_to_output (thread_index); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a3b48d83..2d36c85e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2801,8 +2801,8 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, 
from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - next_index = node->cached_next_index; + tcp_set_time_now (my_thread_index); while (n_left_from > 0) { diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 4c1add21..5a395b9f 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1621,8 +1621,8 @@ tcp46_output_inline (vlib_main_t * vm, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - next_index = node->cached_next_index; + tcp_set_time_now (my_thread_index); while (n_left_from > 0) { -- cgit 1.2.3-korg From ab0289a85c45699878d203b4a0d2e5b38c36cc55 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 14 Aug 2017 11:25:25 -0700 Subject: tcp: state machine improvements - Add SYN_RCVD timeout - Fix FIN_WAIT_1 to CLOSING transition Change-Id: I42ca7fc087f6fdfae15bd7a6175dd3226ed341c7 Signed-off-by: Florin Coras --- src/vnet/session/application.c | 2 +- src/vnet/session/application.h | 2 +- src/vnet/session/session.c | 10 ++++++---- src/vnet/session/stream_session.h | 3 --- src/vnet/tcp/tcp.c | 15 +++++++++++---- src/vnet/tcp/tcp.h | 1 + src/vnet/tcp/tcp_input.c | 30 ++++++++++++++++++++---------- 7 files changed, 40 insertions(+), 23 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index 78c41b93..bc837bb2 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -329,7 +329,7 @@ application_open_session (application_t * app, session_type_t sst, return rv; /* Store api_context for when the reply comes. Not the nicest thing - * but better allocating a separate half-open pool. */ + * but better than allocating a separate half-open pool. 
*/ tc->s_index = api_context; return 0; diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h index 29d37a06..45bc0013 100644 --- a/src/vnet/session/application.h +++ b/src/vnet/session/application.h @@ -37,7 +37,7 @@ typedef struct _stream_session_cb_vft int (*session_accept_callback) (stream_session_t * new_session); /* Connection request callback */ - int (*session_connected_callback) (u32 app_index, u32 api_context, + int (*session_connected_callback) (u32 app_index, u32 opaque, stream_session_t * s, u8 code); /** Notify app that session is closing */ diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 3a3e4dfe..843d474f 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -373,7 +373,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) application_t *app; stream_session_t *new_s = 0; u64 handle; - u32 api_context = 0; + u32 opaque = 0; int error = 0; handle = stream_session_half_open_lookup_handle (&tc->lcl_ip, &tc->rmt_ip, @@ -385,9 +385,11 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) return -1; } - /* Get the app's index from the handle we stored when opening connection */ + /* Get the app's index from the handle we stored when opening connection + * and the opaque (api_context for external apps) from transport session + * index*/ app = application_get (handle >> 32); - api_context = tc->s_index; + opaque = tc->s_index; if (!is_fail) { @@ -406,7 +408,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) } /* Notify client application */ - if (app->cb_fns.session_connected_callback (app->index, api_context, new_s, + if (app->cb_fns.session_connected_callback (app->index, opaque, new_s, is_fail)) { clib_warning ("failed to notify app"); diff --git a/src/vnet/session/stream_session.h b/src/vnet/session/stream_session.h index 4c263211..39bf846a 100644 --- a/src/vnet/session/stream_session.h +++ 
b/src/vnet/session/stream_session.h @@ -83,9 +83,6 @@ typedef struct _stream_session_t u32 opaque2; - /** connected (server) session handle */ - u64 server_session_handle; - /** Opaque, pad to a 64-octet boundary */ u64 opaque[1]; } stream_session_t; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 10ecf2f3..75c9d8dc 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1106,10 +1106,17 @@ tcp_timer_establish_handler (u32 conn_index) tcp_connection_t *tc; tc = tcp_half_open_connection_get (conn_index); - tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; - - ASSERT (tc->state == TCP_STATE_SYN_SENT); - stream_session_connect_notify (&tc->connection, 1 /* fail */ ); + if (tc) + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + stream_session_connect_notify (&tc->connection, 1 /* fail */ ); + } + else + { + tc = tcp_connection_get (conn_index, vlib_get_thread_index ()); + ASSERT (tc->state == TCP_STATE_SYN_RCVD); + } tcp_connection_cleanup (tc); } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 52610ddd..8010b446 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -97,6 +97,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; * ticks to timer units */ #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ +#define TCP_SYN_RCVD_TIME 100 /* 10s */ #define TCP_2MSL_TIME 300 /* 30s */ #define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 2d36c85e..a3c4f1d8 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2254,8 +2254,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; stream_session_accept_notify (&tc0->connection); - /* Reset SYN-ACK retransmit timer */ + /* Reset SYN-ACK retransmit and SYN_RCV establish 
timers */ tcp_retransmit_timer_reset (tc0); + tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -2281,13 +2282,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ else if (tc0->snd_una == tc0->snd_una_max) { - ASSERT (tcp_fin (tcp0)); tc0->rcv_nxt += 1; tc0->state = TCP_STATE_FIN_WAIT_2; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - /* Stop all timers, 2MSL will be set lower */ - tcp_connection_timers_reset (tc0); + if (tcp_fin (tcp0)) + { + /* Stop all timers, 2MSL will be set lower */ + tcp_connection_timers_reset (tc0); + } + else + { + /* Wait for peer to finish sending its data */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, + TCP_2MSL_TIME); + } } break; case TCP_STATE_FIN_WAIT_2: @@ -2296,8 +2305,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * acknowledged ("ok") but do not delete the TCB. */ if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; - - /* check if rtx queue is empty and ack CLOSE TODO */ break; case TCP_STATE_CLOSE_WAIT: /* Do the same processing as for the ESTABLISHED state. */ @@ -2311,9 +2318,9 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; - /* XXX test that send queue empty */ tc0->state = TCP_STATE_TIME_WAIT; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); goto drop; break; @@ -2409,10 +2416,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* move along .. 
*/ break; case TCP_STATE_FIN_WAIT_1: - tc0->state = TCP_STATE_TIME_WAIT; - tcp_connection_timers_reset (tc0); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tc0->state = TCP_STATE_CLOSING; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + /* Wait for ACK but not forever */ + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! */ @@ -2652,6 +2661,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); next0 = tcp_next_output (is_ip4); + tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME); drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) -- cgit 1.2.3-korg From c9d1c5b6b3668e243bbdd978069976cc1184892b Mon Sep 17 00:00:00 2001 From: root Date: Tue, 15 Aug 2017 12:58:31 -0400 Subject: tcp: fix v6 sessions Change-Id: Ia6dd5e948b17b2f3866fe70838eabb09e35415e1 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/vnet/ip/ip6_forward.c | 118 +++++++++++++++++++++----------------- src/vnet/session/session.c | 4 +- src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_lookup.c | 8 +-- src/vnet/tcp/tcp.c | 17 +++++- src/vnet/tcp/tcp_input.c | 16 +----- src/vnet/tcp/tcp_output.c | 6 +- 7 files changed, 90 insertions(+), 81 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 8ae08a01..5832bd0b 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -1323,6 +1323,20 @@ ip6_urpf_loose_check (ip6_main_t * im, vlib_buffer_t * b, ip6_header_t * i) return (fib_urpf_check_size (lb0->lb_urpf)); } +always_inline u8 +ip6_next_proto_is_tcp_udp (vlib_buffer_t * p0, ip6_header_t * ip0, + u32 * udp_offset0) +{ + u32 proto0; + proto0 = ip6_locate_header (p0, ip0, IP_PROTOCOL_UDP, udp_offset0); + if (proto0 != IP_PROTOCOL_UDP) + { + proto0 = 
ip6_locate_header (p0, ip0, IP_PROTOCOL_TCP, udp_offset0); + proto0 = (proto0 == IP_PROTOCOL_TCP) ? proto0 : 0; + } + return proto0; +} + static uword ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { @@ -1352,8 +1366,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) u32 pi0, ip_len0, udp_len0, flags0, next0; u32 pi1, ip_len1, udp_len1, flags1, next1; i32 len_diff0, len_diff1; - u8 error0, type0, good_l4_checksum0; - u8 error1, type1, good_l4_checksum1; + u8 error0, type0, good_l4_csum0, is_tcp_udp0; + u8 error1, type1, good_l4_csum1, is_tcp_udp1; u32 udp_offset0, udp_offset1; pi0 = to_next[0] = from[0]; @@ -1381,67 +1395,69 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) flags0 = p0->flags; flags1 = p1->flags; - good_l4_checksum0 = - (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; - good_l4_checksum1 = - (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + is_tcp_udp0 = ip6_next_proto_is_tcp_udp (p0, ip0, &udp_offset0); + is_tcp_udp1 = ip6_next_proto_is_tcp_udp (p1, ip1, &udp_offset1); + + good_l4_csum0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + good_l4_csum1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; len_diff0 = 0; len_diff1 = 0; - if (PREDICT_TRUE (IP_PROTOCOL_UDP == ip6_locate_header (p0, ip0, - IP_PROTOCOL_UDP, - &udp_offset0))) + if (PREDICT_TRUE (is_tcp_udp0)) { udp0 = (udp_header_t *) ((u8 *) ip0 + udp_offset0); /* Don't verify UDP checksum for packets with explicit zero checksum. */ - good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP + good_l4_csum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0; /* Verify UDP length. 
*/ - ip_len0 = clib_net_to_host_u16 (ip0->payload_length); - udp_len0 = clib_net_to_host_u16 (udp0->length); - len_diff0 = ip_len0 - udp_len0; + if (is_tcp_udp0 == IP_PROTOCOL_UDP) + { + ip_len0 = clib_net_to_host_u16 (ip0->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + } } - if (PREDICT_TRUE (IP_PROTOCOL_UDP == ip6_locate_header (p1, ip1, - IP_PROTOCOL_UDP, - &udp_offset1))) + if (PREDICT_TRUE (is_tcp_udp1)) { udp1 = (udp_header_t *) ((u8 *) ip1 + udp_offset1); /* Don't verify UDP checksum for packets with explicit zero checksum. */ - good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UDP + good_l4_csum1 |= type1 == IP_BUILTIN_PROTOCOL_UDP && udp1->checksum == 0; /* Verify UDP length. */ - ip_len1 = clib_net_to_host_u16 (ip1->payload_length); - udp_len1 = clib_net_to_host_u16 (udp1->length); - len_diff1 = ip_len1 - udp_len1; + if (is_tcp_udp1 == IP_PROTOCOL_UDP) + { + ip_len1 = clib_net_to_host_u16 (ip1->payload_length); + udp_len1 = clib_net_to_host_u16 (udp1->length); + len_diff1 = ip_len1 - udp_len1; + } } - good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; - good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UNKNOWN; + good_l4_csum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; + good_l4_csum1 |= type1 == IP_BUILTIN_PROTOCOL_UNKNOWN; len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0; len_diff1 = type1 == IP_BUILTIN_PROTOCOL_UDP ? 
len_diff1 : 0; if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN - && !good_l4_checksum0 + && !good_l4_csum0 && !(flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); - good_l4_checksum0 = + good_l4_csum0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } if (PREDICT_FALSE (type1 != IP_BUILTIN_PROTOCOL_UNKNOWN - && !good_l4_checksum1 + && !good_l4_csum1 && !(flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, p1); - good_l4_checksum1 = + good_l4_csum1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } error0 = error1 = IP6_ERROR_UNKNOWN_PROTOCOL; - error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0; error1 = len_diff1 < 0 ? IP6_ERROR_UDP_LENGTH : error1; @@ -1449,10 +1465,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_ERROR_UDP_CHECKSUM); ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM); - error0 = - (!good_l4_checksum0 ? IP6_ERROR_UDP_CHECKSUM + type0 : error0); - error1 = - (!good_l4_checksum1 ? IP6_ERROR_UDP_CHECKSUM + type1 : error1); + error0 = (!good_l4_csum0 ? IP6_ERROR_UDP_CHECKSUM + type0 : error0); + error1 = (!good_l4_csum1 ? IP6_ERROR_UDP_CHECKSUM + type1 : error1); /* Drop packets from unroutable hosts. 
*/ /* If this is a neighbor solicitation (ICMP), skip source RPF check */ @@ -1491,8 +1505,9 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) udp_header_t *udp0; u32 pi0, ip_len0, udp_len0, flags0, next0; i32 len_diff0; - u8 error0, type0, good_l4_checksum0; + u8 error0, type0, good_l4_csum0; u32 udp_offset0; + u8 is_tcp_udp0; pi0 = to_next[0] = from[0]; from += 1; @@ -1501,59 +1516,55 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) n_left_to_next -= 1; p0 = vlib_get_buffer (vm, pi0); - ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->l3_hdr_offset = p0->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; next0 = lm->local_next_by_ip_protocol[ip0->protocol]; - flags0 = p0->flags; + is_tcp_udp0 = ip6_next_proto_is_tcp_udp (p0, ip0, &udp_offset0); + good_l4_csum0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; - good_l4_checksum0 = - (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; len_diff0 = 0; - - if (PREDICT_TRUE (IP_PROTOCOL_UDP == ip6_locate_header (p0, ip0, - IP_PROTOCOL_UDP, - &udp_offset0))) + if (PREDICT_TRUE (is_tcp_udp0)) { udp0 = (udp_header_t *) ((u8 *) ip0 + udp_offset0); - /* Don't verify UDP checksum for packets with explicit zero checksum. */ - good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP + /* Don't verify UDP checksum for packets with explicit zero + * checksum. */ + good_l4_csum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0; /* Verify UDP length. 
*/ - ip_len0 = clib_net_to_host_u16 (ip0->payload_length); - udp_len0 = clib_net_to_host_u16 (udp0->length); - len_diff0 = ip_len0 - udp_len0; + if (is_tcp_udp0 == IP_PROTOCOL_UDP) + { + ip_len0 = clib_net_to_host_u16 (ip0->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + } } - good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; + good_l4_csum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0; if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN - && !good_l4_checksum0 + && !good_l4_csum0 && !(flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED))) { flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); - good_l4_checksum0 = + good_l4_csum0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; } error0 = IP6_ERROR_UNKNOWN_PROTOCOL; - error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0; ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_UDP == IP6_ERROR_UDP_CHECKSUM); ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM); - error0 = - (!good_l4_checksum0 ? IP6_ERROR_UDP_CHECKSUM + type0 : error0); + error0 = (!good_l4_csum0 ? IP6_ERROR_UDP_CHECKSUM + type0 : error0); - /* If this is a neighbor solicitation (ICMP), skip source RPF check */ + /* If this is a neighbor solicitation (ICMP), skip src RPF check */ if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP && !ip6_address_is_link_local_unicast (&ip0->src_address)) @@ -1564,7 +1575,6 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next0 = error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? 
IP_LOCAL_NEXT_DROP : next0; - p0->error = error_node->errors[error0]; vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 843d474f..70a5cd83 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -375,10 +375,12 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) u64 handle; u32 opaque = 0; int error = 0; + u8 st; + st = session_type_from_proto_and_ip (tc->transport_proto, tc->is_ip4); handle = stream_session_half_open_lookup_handle (&tc->lcl_ip, &tc->rmt_ip, tc->lcl_port, tc->rmt_port, - tc->transport_proto); + st); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { clib_warning ("This can't be good!"); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 9f3d217c..f60048f1 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -315,7 +315,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, VLIB_CLI_COMMAND (vlib_cli_show_session_command) = { .path = "show session", - .short_help = "show session [verbose]", + .short_help = "show session [verbose [nnn]]", .function = show_session_command_fn, }; /* *INDENT-ON* */ diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index 41f9dbf0..0f9abf9a 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -106,8 +106,8 @@ make_v4_listener_kv (session_kv4_t * kv, ip4_address_t * lcl, u16 lcl_port, always_inline void make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) { - return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, - t->rmt_port, t->transport_proto); + make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, t->rmt_port, + session_type_from_proto_and_ip (t->transport_proto, 1)); } always_inline void @@ -149,8 +149,8 @@ make_v6_listener_kv (session_kv6_t * kv, ip6_address_t * lcl, u16 lcl_port, always_inline void 
make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) { - make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, - t->rmt_port, t->transport_proto); + make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, t->rmt_port, + session_type_from_proto_and_ip (t->transport_proto, 0)); } /* diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 75c9d8dc..d1690022 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -362,7 +362,11 @@ ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) /* *INDENT-OFF* */ foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ , ({ - return ip_interface_address_get_address (lm6, ia); + ip6_address_t *rv; + rv = ip_interface_address_get_address (lm6, ia); + /* Trying to use a link-local ip6 src address is a fool's errand */ + if (!ip6_address_is_link_local_unicast (rv)) + return rv; })); /* *INDENT-ON* */ } @@ -635,6 +639,14 @@ tcp_connection_open (transport_endpoint_t * rmt) else { ip6 = ip_interface_get_first_ip (sw_if_index, 0); + if (ip6 == 0) + { + clib_warning ("no routable ip6 addresses on %U", + format_vnet_sw_if_index_name, vnet_get_main (), + sw_if_index); + return -1; + } + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); } } @@ -1109,7 +1121,6 @@ tcp_timer_establish_handler (u32 conn_index) if (tc) { ASSERT (tc->state == TCP_STATE_SYN_SENT); - tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; stream_session_connect_notify (&tc->connection, 1 /* fail */ ); } else @@ -1117,6 +1128,7 @@ tcp_timer_establish_handler (u32 conn_index) tc = tcp_connection_get (conn_index, vlib_get_thread_index ()); ASSERT (tc->state == TCP_STATE_SYN_RCVD); } + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; tcp_connection_cleanup (tc); } @@ -1231,6 +1243,7 @@ tcp_main_enable (vlib_main_t * vm) pi->unformat_pg_edit = unformat_pg_tcp_header; ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index); + ip6_register_protocol (IP_PROTOCOL_TCP, tcp6_input_node.index); /* 
Register as transport with session layer */ session_register_transport (TRANSPORT_PROTO_TCP, 1, &tcp_proto); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a3c4f1d8..6f9e4c7a 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1877,26 +1877,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = tcp_half_open_connection_get (vnet_buffer (b0)-> tcp.connection_index); + ASSERT (tc0); ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); - if (!tc0) - { - ip4_header_t *ip40 = vlib_buffer_get_current (b0); - tcp0 = ip4_next_header (ip40); - tc0 = - (tcp_connection_t *) - stream_session_lookup_transport_wt4 (&ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); - ASSERT (0); - goto drop; - } if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 5a395b9f..e6a211ba 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -775,12 +775,10 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) else { int bogus = ~0; - pkt_ih6 = (ip6_header_t *) (pkt_th - 1); ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); - ih6 = - vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address, - &pkt_ih6->src_address, IP_PROTOCOL_TCP); + ih6 = vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address, + &pkt_ih6->src_address, IP_PROTOCOL_TCP); th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); ASSERT (!bogus); } -- cgit 1.2.3-korg From 1f152cd6faf96b524b6b7071b5cffe1916f9c5cc Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Fri, 18 Aug 2017 19:28:03 -0700 Subject: tcp: retransmit and multi-buffer segment fixes and improvements - set session state as closed on session manager delete - enable retransmit as opposed to persist timer 
after persist timer completes - properly discard buffer chain bytes when new data overlaps ooo segments - don't use rxt bytes in snd space estimate used on tx path Change-Id: Id9cab686e532e5fe70c775d5440260e8eb890a9f Signed-off-by: Florin Coras --- src/svm/svm_fifo.c | 14 +- src/vnet/session/segment_manager.c | 1 + src/vnet/session/session.c | 105 ++++++++++++--- src/vnet/session/session_node.c | 26 ++-- src/vnet/session/stream_session.h | 2 +- src/vnet/tcp/tcp.c | 39 +++--- src/vnet/tcp/tcp.h | 23 +++- src/vnet/tcp/tcp_input.c | 27 ++-- src/vnet/tcp/tcp_output.c | 264 +++++++++++++++++++++---------------- 9 files changed, 319 insertions(+), 182 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 7f8127cf..8fe82f56 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -53,10 +53,12 @@ ooo_segment_end_pos (svm_fifo_t * f, ooo_segment_t * s) u8 * format_ooo_segment (u8 * s, va_list * args) { + svm_fifo_t *f = va_arg (*args, svm_fifo_t *); ooo_segment_t *seg = va_arg (*args, ooo_segment_t *); - - s = format (s, "pos %u, len %u, next %d, prev %d", - seg->start, seg->length, seg->next, seg->prev); + u32 normalized_start = (seg->start + f->nitems - f->tail) % f->nitems; + s = format (s, "[%u, %u], len %u, next %d, prev %d", normalized_start, + (normalized_start + seg->length) % f->nitems, seg->length, + seg->next, seg->prev); return s; } @@ -154,7 +156,7 @@ format_ooo_list (u8 * s, va_list * args) while (ooo_segment_index != OOO_SEGMENT_INVALID_INDEX) { seg = pool_elt_at_index (f->ooo_segments, ooo_segment_index); - s = format (s, " %U\n", format_ooo_segment, seg); + s = format (s, " %U\n", format_ooo_segment, f, seg); ooo_segment_index = seg->next; } @@ -557,7 +559,6 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, { u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems, normalized_offset; - u32 offset_from_tail; f->ooos_newest = OOO_SEGMENT_INVALID_INDEX; @@ -570,8 
+571,7 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, normalized_offset = (f->tail + offset) % nitems; /* Will this request fit? */ - offset_from_tail = (nitems + normalized_offset - f->tail) % nitems; - if ((required_bytes + offset_from_tail) > (nitems - cursize)) + if ((required_bytes + offset) > (nitems - cursize)) return -1; svm_fifo_trace_add (f, offset, required_bytes, 1); diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index 262b7faa..43977063 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -224,6 +224,7 @@ segment_manager_del (segment_manager_t * sm) session = stream_session_get (session_index, thread_index); /* Instead of directly removing the session call disconnect */ + session->session_state = SESSION_STATE_CLOSED; session_send_session_evt_to_thread (stream_session_handle (session), FIFO_EVENT_DISCONNECT, thread_index); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 70a5cd83..6fe99047 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -92,38 +92,104 @@ stream_session_create_i (segment_manager_t * sm, transport_connection_t * tc, return 0; } -/** Enqueue buffer chain tail */ +/** + * Discards bytes from buffer chain + * + * It discards n_bytes_to_drop starting at first buffer after chain_b + */ +always_inline void +session_enqueue_discard_chain_bytes (vlib_main_t * vm, vlib_buffer_t * b, + vlib_buffer_t ** chain_b, + u32 n_bytes_to_drop) +{ + vlib_buffer_t *next = *chain_b; + u32 to_drop = n_bytes_to_drop; + ASSERT (b->flags & VLIB_BUFFER_NEXT_PRESENT); + while (to_drop && (next->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + next = vlib_get_buffer (vm, next->next_buffer); + if (next->current_length > to_drop) + { + vlib_buffer_advance (next, to_drop); + to_drop = 0; + } + else + { + to_drop -= next->current_length; + next->current_length = 0; + } + } + *chain_b = next; + + if (to_drop == 0) + 
b->total_length_not_including_first_buffer -= n_bytes_to_drop; +} + +/** + * Enqueue buffer chain tail + */ always_inline int session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, u32 offset, u8 is_in_order) { vlib_buffer_t *chain_b; - u32 chain_bi = b->next_buffer, len; + u32 chain_bi, len, diff; vlib_main_t *vm = vlib_get_main (); u8 *data; - u16 written = 0; + u32 written = 0; int rv = 0; + if (is_in_order && offset) + { + diff = offset - b->current_length; + if (diff > b->total_length_not_including_first_buffer) + return 0; + chain_b = b; + session_enqueue_discard_chain_bytes (vm, b, &chain_b, diff); + chain_bi = vlib_get_buffer_index (vm, chain_b); + } + else + chain_bi = b->next_buffer; + do { chain_b = vlib_get_buffer (vm, chain_bi); data = vlib_buffer_get_current (chain_b); len = chain_b->current_length; + if (!len) + continue; if (is_in_order) { rv = svm_fifo_enqueue_nowait (s->server_rx_fifo, len, data); - if (rv < len) + if (rv == len) + { + written += rv; + } + else if (rv < len) { return (rv > 0) ? 
(written + rv) : written; } - written += rv; + else if (rv > len) + { + written += rv; + + /* written more than what was left in chain */ + if (written > b->total_length_not_including_first_buffer) + return written; + + /* drop the bytes that have already been delivered */ + session_enqueue_discard_chain_bytes (vm, b, &chain_b, rv - len); + } } else { rv = svm_fifo_enqueue_with_offset (s->server_rx_fifo, offset, len, data); if (rv) - return -1; + { + clib_warning ("failed to enqueue multi-buffer seg"); + return -1; + } offset += len; } } @@ -155,22 +221,22 @@ stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order) { stream_session_t *s; - int enqueued = 0, rv; + int enqueued = 0, rv, in_order_off; s = stream_session_get (tc->s_index, tc->thread_index); if (is_in_order) { - enqueued = - svm_fifo_enqueue_nowait (s->server_rx_fifo, b->current_length, - vlib_buffer_get_current (b)); - if (PREDICT_FALSE - ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && enqueued > 0)) + enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, + b->current_length, + vlib_buffer_get_current (b)); + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) + && enqueued >= 0)) { - rv = session_enqueue_chain_tail (s, b, 0, 1); - if (rv <= 0) - return enqueued; - enqueued += rv; + in_order_off = enqueued > b->current_length ? 
enqueued : 0; + rv = session_enqueue_chain_tail (s, b, in_order_off, 1); + if (rv > 0) + enqueued += rv; } } else @@ -179,9 +245,10 @@ stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, b->current_length, vlib_buffer_get_current (b)); if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) && !rv)) - rv = session_enqueue_chain_tail (s, b, offset + b->current_length, 0); - if (rv) - return -1; + session_enqueue_chain_tail (s, b, offset + b->current_length, 0); + /* if something was enqueued, report even this as success for ooo + * segment handling */ + return rv; } if (queue_event) diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index fac2b852..cd52742b 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -76,7 +76,7 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, u8 thread_index, svm_fifo_t * fifo, vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, u32 left_from_seg, u32 * left_to_snd0, - u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf, + u16 * n_bufs, u32 * tx_offset, u16 deq_per_buf, u8 peek_data) { vlib_buffer_t *chain_b0, *prev_b0; @@ -104,8 +104,8 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, data0 = vlib_buffer_get_current (chain_b0); if (peek_data) { - n_bytes_read = svm_fifo_peek (fifo, *rx_offset, len_to_deq0, data0); - *rx_offset += n_bytes_read; + n_bytes_read = svm_fifo_peek (fifo, *tx_offset, len_to_deq0, data0); + *tx_offset += n_bytes_read; } else { @@ -126,7 +126,8 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, if (to_deq == 0) break; } - ASSERT (to_deq == 0); + ASSERT (to_deq == 0 + && b0->total_length_not_including_first_buffer == left_from_seg); *left_to_snd0 -= left_from_seg; } @@ -144,7 +145,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, transport_proto_vft_t *transport_vft; u32 next_index, next0, *to_next, n_left_to_next, bi0; 
vlib_buffer_t *b0; - u32 rx_offset = 0, max_dequeue0, n_bytes_per_seg; + u32 tx_offset = 0, max_dequeue0, n_bytes_per_seg, left_for_seg; u16 snd_mss0, n_bufs_per_seg, n_bufs; u8 *data0; int i, n_bytes_read; @@ -170,11 +171,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (peek_data) { /* Offset in rx fifo from where to peek data */ - rx_offset = transport_vft->tx_fifo_offset (tc0); + tx_offset = transport_vft->tx_fifo_offset (tc0); } /* Check how much we can pull. If buffering, subtract the offset */ - max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - tx_offset; /* Nothing to read return */ if (max_dequeue0 == 0) @@ -193,6 +194,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, } else { + /* Expectation is that snd_space0 is already a multiple of snd_mss */ max_len_to_snd0 = snd_space0; } @@ -265,8 +267,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); b0->error = 0; - b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID - | VNET_BUFFER_F_LOCALLY_ORIGINATED; + b0->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; @@ -274,11 +275,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); if (peek_data) { - n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, rx_offset, + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, tx_offset, len_to_deq0, data0); /* Keep track of progress locally, transport is also supposed to * increment it independently when pushing the header */ - rx_offset += n_bytes_read; + tx_offset += n_bytes_read; } else { @@ -299,12 +300,11 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (PREDICT_FALSE (n_bufs_per_seg > 1 && left_to_snd0)) { - u32 left_for_seg; left_for_seg = 
clib_min (snd_mss0 - n_bytes_read, left_to_snd0); session_tx_fifo_chain_tail (smm, vm, thread_index, s0->server_tx_fifo, b0, bi0, n_bufs_per_seg, left_for_seg, - &left_to_snd0, &n_bufs, &rx_offset, + &left_to_snd0, &n_bufs, &tx_offset, deq_per_buf, peek_data); } diff --git a/src/vnet/session/stream_session.h b/src/vnet/session/stream_session.h index 533cf97f..275052d3 100644 --- a/src/vnet/session/stream_session.h +++ b/src/vnet/session/stream_session.h @@ -56,7 +56,7 @@ typedef struct _stream_session_t u8 session_type; /** State */ - u8 session_state; + volatile u8 session_state; u8 thread_index; diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 6edf52af..197fff96 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -798,7 +798,8 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, "rtt_seq %u\n", tc->rtt_seq); s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent, tcp_time_now () - tc->tsval_recent_age); - s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb, + tc); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -858,7 +859,7 @@ format_tcp_session (u8 * s, va_list * args) if (tc) s = format (s, "%U", format_tcp_connection, tc, verbose); else - s = format (s, "empty"); + s = format (s, "empty\n"); return s; } @@ -930,7 +931,11 @@ u8 * format_tcp_sack_hole (u8 * s, va_list * args) { sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *); - s = format (s, "[%u, %u]", hole->start, hole->end); + tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); + if (tc) + s = format (s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss); + else + s = format (s, " [%u, %u]", hole->start, hole->end); return s; } @@ -938,6 +943,7 @@ u8 * format_tcp_scoreboard (u8 * s, va_list * args) { sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); + tcp_connection_t *tc = va_arg 
(*args, tcp_connection_t *); sack_scoreboard_hole_t *hole; s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); @@ -952,7 +958,7 @@ format_tcp_scoreboard (u8 * s, va_list * args) while (hole) { - s = format (s, "%U", format_tcp_sack_hole, hole); + s = format (s, "%U", format_tcp_sack_hole, hole, tc); hole = scoreboard_next_hole (sb, hole); } @@ -1001,13 +1007,10 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0; } - /* If we can't write at least a segment, don't try at all */ + /* If not snd_wnd constrained and we can't write at least a segment, + * don't try at all */ if (PREDICT_FALSE (snd_space < tc->snd_mss)) - { - if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX) - return snd_space; - return 0; - } + return 0; /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1030,7 +1033,7 @@ tcp_snd_space (tcp_connection_t * tc) if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0)) { - snd_space = tcp_available_snd_space (tc); + snd_space = tcp_available_output_snd_space (tc); /* If we haven't gotten dupacks or if we did and have gotten sacked * bytes then we can still send as per Limited Transmit (RFC3042) */ @@ -1051,17 +1054,20 @@ tcp_snd_space (tcp_connection_t * tc) if (tcp_in_recovery (tc)) { tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes + snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes - (tc->snd_una_max - tc->snd_congestion); if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; return tcp_round_snd_space (tc, snd_space); } - /* If in fast recovery, send 1 SMSS if wnd allows */ - if (tcp_in_fastrecovery (tc) - && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc)) + /* RFC 5681: When previously unsent data is available and the new value of + * cwnd and the receiver's advertised window 
allow, a TCP SHOULD send 1*SMSS + * bytes of previously unsent data. */ + if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { + if (tcp_available_output_snd_space (tc) < tc->snd_mss) + return 0; tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; } @@ -1073,7 +1079,8 @@ u32 tcp_session_send_space (transport_connection_t * trans_conn) { tcp_connection_t *tc = (tcp_connection_t *) trans_conn; - return tcp_snd_space (tc); + return clib_min (tcp_snd_space (tc), + tc->snd_wnd - (tc->snd_nxt - tc->snd_una)); } i32 diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 097cc8cf..9e4660b8 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -498,7 +498,9 @@ tcp_half_open_connection_get (u32 conn_index) void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b); void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); -void tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4); +void tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, + u8 is_ip4); +void tcp_send_reset (tcp_connection_t * tc); void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); @@ -582,15 +584,30 @@ tcp_loss_wnd (const tcp_connection_t * tc) } always_inline u32 -tcp_available_wnd (const tcp_connection_t * tc) +tcp_available_snd_wnd (const tcp_connection_t * tc) { return clib_min (tc->cwnd, tc->snd_wnd); } +always_inline u32 +tcp_available_output_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_snd_wnd (tc); + int flight_size = (int) (tc->snd_nxt - tc->snd_una); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +/** + * Estimate of how many bytes we can still push into the network + */ always_inline u32 tcp_available_snd_space (const tcp_connection_t * tc) { - u32 available_wnd = tcp_available_wnd (tc); + u32 available_wnd = 
tcp_available_snd_wnd (tc); u32 flight_size = tcp_flight_size (tc); if (available_wnd <= flight_size) diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6f9e4c7a..95f9ade1 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1363,7 +1363,7 @@ always_inline int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { - int written; + int written, error = TCP_ERROR_ENQUEUED; ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)); @@ -1381,12 +1381,12 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Update rcv_nxt */ if (PREDICT_TRUE (written == data_len)) { - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + tc->rcv_nxt += written; } /* If more data written than expected, account for out-of-order bytes. */ else if (written > data_len) { - tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len; + tc->rcv_nxt += written; /* Send ACK confirming the update */ tc->flags |= TCP_CONN_SNDACK; @@ -1400,7 +1400,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, * not be enqueued. 
Inform peer */ tc->flags |= TCP_CONN_SNDACK; - return TCP_ERROR_PARTIALLY_ENQUEUED; + error = TCP_ERROR_PARTIALLY_ENQUEUED; } else { @@ -1415,7 +1415,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); } - return TCP_ERROR_ENQUEUED; + return error; } /** Enqueue out-of-order data */ @@ -1495,10 +1495,10 @@ tcp_can_delack (tcp_connection_t * tc) static int tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) { - u32 discard; + u32 discard, first = b->current_length; vlib_main_t *vm = vlib_get_main (); - /* Handle multi segment packets */ + /* Handle multi-buffer segments */ if (n_bytes_to_drop > b->current_length) { if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) @@ -1511,7 +1511,12 @@ tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) n_bytes_to_drop -= discard; } while (n_bytes_to_drop); + if (n_bytes_to_drop > first) + b->total_length_not_including_first_buffer -= n_bytes_to_drop - first; } + else + vlib_buffer_advance (b, n_bytes_to_drop); + vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop; return 0; } @@ -1908,7 +1913,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) { if (!tcp_rst (tcp0)) - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } @@ -1995,7 +2000,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * allocate session send reset */ if (stream_session_connect_notify (&new_tc0->connection, 0)) { - tcp_send_reset (new_tc0, b0, is_ip4); + tcp_send_reset_w_pkt (new_tc0, b0, is_ip4); tcp_connection_cleanup (new_tc0); goto drop; } @@ -2017,7 +2022,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (stream_session_connect_notify (&new_tc0->connection, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0); goto 
drop; } @@ -2221,7 +2226,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { - tcp_send_reset (tc0, b0, is_ip4); + tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 7da0c073..c56eadf8 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -440,7 +440,8 @@ tcp_init_mss (tcp_connection_t * tc) always_inline int tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) { - vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1); + vec_validate (tm->tx_buffers[thread_index], + vec_len (tm->tx_buffers[thread_index]) + n_free_buffers - 1); _vec_len (tm->tx_buffers[thread_index]) = vlib_buffer_alloc_from_free_list (vlib_get_main (), tm->tx_buffers[thread_index], @@ -480,27 +481,31 @@ tcp_return_buffer (tcp_main_t * tm) _vec_len (my_tx_buffers) += 1; } -always_inline void +always_inline void * tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { - vlib_buffer_t *it = b; - u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; - do - { - it->current_data = 0; - it->current_length = 0; - it->total_length_not_including_first_buffer = 0; - } - while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) - && (it = vlib_get_buffer (vm, it->next_buffer))); - if (b->flags & VLIB_BUFFER_NEXT_PRESENT) vlib_buffer_free_one (vm, b->next_buffer); - b->flags = save_free_list; + b->flags = 0; + b->current_data = 0; + b->current_length = 0; + b->total_length_not_including_first_buffer = 0; + vnet_buffer (b)->tcp.flags = 0; /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +always_inline void * +tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->total_length_not_including_first_buffer = 0; 
vnet_buffer (b)->tcp.flags = 0; + + /* Leave enough space for headers */ + return vlib_buffer_make_headroom (b, MAX_HDRS_LEN); } /** @@ -632,6 +637,59 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, vlib_put_frame_to_node (vm, next_index, f); } +always_inline void +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->error = 0; + + /* Decide where to send the packet */ + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } + to_next = vlib_frame_vector_args (f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + int tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, tcp_state_t state, u8 thread_index, u8 is_ip4) @@ -712,9 +770,11 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, /** * Send reset without reusing existing buffer + * + * It extracts connection info out of original packet */ void -tcp_send_reset (tcp_connection_t * tc, 
vlib_buffer_t * pkt, u8 is_ip4) +tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) { vlib_buffer_t *b; u32 bi; @@ -730,9 +790,7 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) return; b = vlib_get_buffer (vm, bi); - - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); /* Make and write options */ tcp_hdr_len = sizeof (tcp_header_t); @@ -787,6 +845,38 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } +/** + * Build and set reset packet for connection + */ +void +tcp_send_reset (tcp_connection_t * tc) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_buffer_t *b; + u32 bi; + tcp_header_t *th; + u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len; + u8 flags; + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); + tcp_init_buffer (vm, b); + + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); + advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED); + flags = TCP_FLAG_RST; + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); + ASSERT (opts_write_len == tc->snd_opts_len); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); +} + void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { @@ -835,9 +925,7 @@ tcp_send_syn (tcp_connection_t * tc) return; b = vlib_get_buffer (vm, bi); - - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); /* Set random initial sequence */ time_now = tcp_time_now (); @@ -875,59 +963,6 @@ tcp_send_syn 
(tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } -always_inline void -tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4, u8 flush) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - u32 thread_index = vlib_get_thread_index (); - u32 *to_next, next_index; - vlib_frame_t *f; - - b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; - b->error = 0; - - /* Decide where to send the packet */ - next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; - - /* Initialize the trajectory trace, if configured */ - if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) - { - b->pre_data[0] = 1; - b->pre_data[1] = next_index; - } - - /* Get frame to v4/6 output node */ - f = tm->tx_frames[!is_ip4][thread_index]; - if (!f) - { - f = vlib_get_frame_to_node (vm, next_index); - ASSERT (f); - tm->tx_frames[!is_ip4][thread_index] = f; - } - to_next = vlib_frame_vector_args (f); - to_next[f->n_vectors] = bi; - f->n_vectors += 1; - if (flush || f->n_vectors == VLIB_FRAME_SIZE) - { - vlib_put_frame_to_node (vm, next_index, f); - tm->tx_frames[!is_ip4][thread_index] = 0; - } -} - -always_inline void -tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) -{ - tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); -} - -always_inline void -tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4) -{ - tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); -} - /** * Flush tx frame populated by retransmits and timer pops */ @@ -969,7 +1004,7 @@ tcp_send_fin (tcp_connection_t * tc) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - + /* buffer will be initialized by in tcp_make_fin */ tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; @@ -1013,6 +1048,8 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t *th; data_len = b->current_length + b->total_length_not_including_first_buffer; + ASSERT 
(!b->total_length_not_including_first_buffer + || (b->flags & VLIB_BUFFER_NEXT_PRESENT)); vnet_buffer (b)->tcp.flags = 0; if (compute_opts) @@ -1106,29 +1143,27 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, * Make sure we can retransmit something */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + available_bytes -= offset; if (!available_bytes) return 0; max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); max_deq_bytes = clib_min (available_bytes, max_deq_bytes); - seg_size = max_deq_bytes + MAX_HDRS_LEN; /* Start is beyond snd_congestion */ start = tc->snd_una + offset; if (seq_geq (start, tc->snd_congestion)) - { - goto done; - } + goto done; /* Don't overshoot snd_congestion */ if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) { max_deq_bytes = tc->snd_congestion - start; if (max_deq_bytes == 0) - { - goto done; - } + goto done; } + seg_size = max_deq_bytes + MAX_HDRS_LEN; + /* * Prepare options */ @@ -1141,7 +1176,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return 0; *b = vlib_get_buffer (vm, bi); - data = vlib_buffer_make_headroom (*b, MAX_HDRS_LEN); + data = tcp_init_buffer (vm, *b); /* Easy case, buffer size greater than mss */ if (PREDICT_TRUE (seg_size <= tm->bytes_per_buffer)) @@ -1162,7 +1197,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, int i; n_bufs_per_seg = ceil ((double) seg_size / tm->bytes_per_buffer); - ASSERT (available_bytes >= max_deq_bytes); /* Make sure we have enough buffers */ available_bufs = vec_len (tm->tx_buffers[thread_index]); @@ -1182,8 +1216,6 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, b[0]->current_length = n_bytes; b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; b[0]->total_length_not_including_first_buffer = 0; - - tcp_push_hdr_i (tc, *b, tc->state, 0); max_deq_bytes -= n_bytes; chain_b = *b; @@ -1197,22 +1229,22 @@ 
tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, chain_b->current_data = 0; data = vlib_buffer_get_current (chain_b); n_peeked = stream_session_peek_bytes (&tc->connection, data, - n_bytes, len_to_deq); - n_bytes += n_peeked; + offset + n_bytes, len_to_deq); ASSERT (n_peeked == len_to_deq); + n_bytes += n_peeked; chain_b->current_length = n_peeked; - b[0]->total_length_not_including_first_buffer += - chain_b->current_length; + chain_b->flags = 0; + chain_b->next_buffer = 0; /* update previous buffer */ prev_b->next_buffer = chain_bi; prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; - /* update current buffer */ - chain_b->next_buffer = 0; - max_deq_bytes -= n_peeked; + b[0]->total_length_not_including_first_buffer += n_peeked; } + + tcp_push_hdr_i (tc, *b, tc->state, 0); } ASSERT (n_bytes > 0); @@ -1348,7 +1380,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_init_buffer (vm, b); tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ @@ -1409,8 +1441,9 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0; + u32 bi, old_snd_nxt, max_snd_bytes, available_bytes, offset; int n_bytes = 0; + u8 *data; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1419,12 +1452,13 @@ tcp_timer_persist_handler (u32 index) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + offset = tc->snd_una_max - tc->snd_una; /* Problem already solved or worse */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) - || !available_bytes) + || !available_bytes || available_bytes <= offset) 
return; /* Increment RTO backoff */ @@ -1437,18 +1471,17 @@ tcp_timer_persist_handler (u32 index) if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); + data = tcp_init_buffer (vm, b); - tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + tcp_validate_txf_size (tc, offset); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer); - n_bytes = stream_session_peek_bytes (&tc->connection, - vlib_buffer_get_current (b), - tc->snd_una_max - tc->snd_una, - snd_bytes); - ASSERT (n_bytes != 0); + max_snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer - MAX_HDRS_LEN); + n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, + max_snd_bytes); b->current_length = n_bytes; - ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 - || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + ASSERT (n_bytes != 0 && (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, + TCP_TIMER_RETRANSMIT))); /* Allow updating of snd_una_max but don't update snd_nxt */ old_snd_nxt = tc->snd_nxt; @@ -1456,8 +1489,8 @@ tcp_timer_persist_handler (u32 index) tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - /* Re-enable persist timer */ - tcp_persist_timer_set (tc); + /* Just sent new data, enable retransmit */ + tcp_retransmit_timer_update (tc); } /** @@ -1490,7 +1523,7 @@ void tcp_fast_retransmit_sack (tcp_connection_t * tc) { vlib_main_t *vm = vlib_get_main (); - u32 n_written = 0, offset = 0, max_bytes; + u32 n_written = 0, offset, max_bytes; vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; @@ -1523,7 +1556,9 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) * unSACKed sequence number SHOULD be returned, and RescueRxt set to * RecoveryPoint. HighRxt MUST NOT be updated. 
*/ - max_bytes = clib_min (tc->snd_mss, snd_space); + max_bytes = clib_min (tc->snd_mss, + tc->snd_congestion - tc->snd_una); + max_bytes = clib_min (max_bytes, snd_space); offset = tc->snd_congestion - tc->snd_una - max_bytes; sb->rescue_rxt = tc->snd_congestion; tc->snd_nxt = tc->snd_una + offset; @@ -1535,9 +1570,12 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) break; } - max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + max_bytes = clib_min (hole->end - sb->high_rxt, snd_space); + max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; + if (max_bytes == 0) + break; offset = sb->high_rxt - tc->snd_una; - tc->snd_nxt = tc->snd_una + offset; + tc->snd_nxt = sb->high_rxt; n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); /* Nothing left to retransmit */ @@ -1547,6 +1585,7 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) bi = vlib_get_buffer_index (vm, b); sb->high_rxt += n_written; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + ASSERT (n_written <= snd_space); snd_space -= n_written; } @@ -1835,6 +1874,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd)); if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) { -- cgit 1.2.3-korg From d84ba85c0071a28fe888c912c3dc37f471b0caeb Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 22 Aug 2017 17:56:46 -0400 Subject: TCP horizontal scaling - Remove frame handoff support machinery. We haven't used it in a long time. 
- Configuration support for the local endpoints bihash table - Drop lookup failure packets in tcp46_syn_sent Change-Id: Icd51e6785f74661c741e76fac23d21c4cc998d17 Signed-off-by: Dave Barach --- src/vlib/main.c | 9 +-------- src/vlib/node.h | 5 +---- src/vlib/node_funcs.h | 15 ++++++--------- src/vnet/tcp/tcp.c | 23 +++++++++++++++++++++-- src/vnet/tcp/tcp.h | 4 ++++ src/vnet/tcp/tcp_input.c | 6 +++++- 6 files changed, 38 insertions(+), 24 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vlib/main.c b/src/vlib/main.c index 73548fbe..5d99e899 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -138,19 +138,12 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) - { - u32 save_thread_index = f->thread_index; - - memset (f, 0xfe, n); - - f->thread_index = save_thread_index; - } + memset (f, 0xfe, n); /* Insert magic number. */ { diff --git a/src/vlib/node.h b/src/vlib/node.h index 77914272..2acd61ce 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -326,7 +326,7 @@ typedef struct vlib_node_t /* Max number of vector elements to process at once per node. */ #define VLIB_FRAME_SIZE 256 -#define VLIB_FRAME_ALIGN VLIB_MAX_CPUS +#define VLIB_FRAME_ALIGN CLIB_CACHE_LINE_BYTES /* Calling frame (think stack frame) for a node. */ typedef struct vlib_frame_t @@ -343,9 +343,6 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner thread / heap id */ - u16 thread_index; - /* Scalar and vector arguments to next node. 
*/ u8 arguments[0]; } vlib_frame_t; diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index c4c06454..0059b9be 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -216,24 +216,21 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 thread_index = frame_index & VLIB_CPU_MASK; - u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[thread_index]; - f = vm->heap_base + offset; + f = vm->heap_base + (frame_index * VLIB_FRAME_ALIGN); return f; } always_inline u32 vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) { - u32 i; + uword i; - ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - - vm = vlib_mains[f->thread_index]; + ASSERT (((uword) f & (VLIB_FRAME_ALIGN - 1)) == 0); i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->thread_index; + ASSERT ((i / VLIB_FRAME_ALIGN) <= 0xFFFFFFFFULL); + + return i / VLIB_FRAME_ALIGN; } always_inline vlib_frame_t * diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 197fff96..6b2b4759 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1321,9 +1321,14 @@ tcp_main_enable (vlib_main_t * vm) tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock / TCP_TSTAMP_RESOLUTION; + if (tm->local_endpoints_table_buckets == 0) + tm->local_endpoints_table_buckets = 250000; + if (tm->local_endpoints_table_memory == 0) + tm->local_endpoints_table_memory = 512 << 20; + clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", - 1000000 /* $$$$ config parameter nbuckets */ , - (512 << 20) /*$$$ config parameter table size */ ); + tm->local_endpoints_table_buckets, + tm->local_endpoints_table_memory); /* Initialize [port-allocator] random number seed */ tm->port_allocator_seed = (u32) clib_cpu_time_now (); @@ -1377,6 +1382,7 @@ static clib_error_t * tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) { tcp_main_t *tm = vnet_get_tcp_main (); + u64 tmp; while (unformat_check_input (input) != 
UNFORMAT_END_OF_INPUT) { @@ -1387,6 +1393,19 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "preallocated-half-open-connections %d", &tm->preallocated_half_open_connections)) ; + else if (unformat (input, "local-endpoints-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + tm->local_endpoints_table_memory = tmp; + } + else if (unformat (input, "local-endpoints-table-buckets %d", + &tm->local_endpoints_table_buckets)) + ; + + else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 9e4660b8..11d61f5d 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -399,6 +399,10 @@ typedef struct _tcp_main u32 preallocated_connections; u32 preallocated_half_open_connections; + /** Transport table (preallocation) size parameters */ + u32 local_endpoints_table_memory; + u32 local_endpoints_table_buckets; + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ ip4_address_t *ip4_src_addresses; u32 last_v4_address_rotor; diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 95f9ade1..66e2b88f 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1882,7 +1882,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = tcp_half_open_connection_get (vnet_buffer (b0)-> tcp.connection_index); - ASSERT (tc0); + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto drop; + } ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer (b0)->tcp.seq_number; -- cgit 1.2.3-korg From 50958959b57c9c2d3fc72ae7588c53d1804aeb86 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 29 Aug 2017 14:50:13 -0700 Subject: tcp: re-enable persist timer if no data available to send Additionally, flush rx fifos for closed sessions. 
Change-Id: If2cc563fbda0451e7572650e98b15f0a694a0ff9 Signed-off-by: Florin Coras --- src/vnet/session/session.c | 8 +++++++- src/vnet/tcp/tcp_input.c | 1 + src/vnet/tcp/tcp_output.c | 22 ++++++++++++++++++---- 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index ee22ccbe..dcd141f1 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -330,7 +330,13 @@ stream_session_enqueue_notify (stream_session_t * s, u8 block) static u32 serial_number; if (PREDICT_FALSE (s->session_state == SESSION_STATE_CLOSED)) - return 0; + { + /* Session is closed so app will never clean up. Flush rx fifo */ + u32 to_dequeue = svm_fifo_max_dequeue (s->server_rx_fifo); + if (to_dequeue) + svm_fifo_dequeue_drop (s->server_rx_fifo, to_dequeue); + return 0; + } /* Get session's server */ app = application_get_if_valid (s->app_index); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 66e2b88f..1d903453 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -3073,6 +3073,7 @@ do { \ _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index c56eadf8..02555513 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1452,15 +1452,29 @@ tcp_timer_persist_handler (u32 index) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; - offset = tc->snd_una_max - tc->snd_una; /* Problem already solved or worse */ - 
available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED - || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) - || !available_bytes || available_bytes <= offset) + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) return; + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + offset = tc->snd_una_max - tc->snd_una; + + /* Reprogram persist if no new bytes available to send. We may have data + * next time */ + if (!available_bytes) + { + tcp_persist_timer_set (tc); + return; + } + + if (available_bytes <= offset) + { + ASSERT (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + return; + } + /* Increment RTO backoff */ tc->rto_boff += 1; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); -- cgit 1.2.3-korg From 4eeeaaf5e822718eb222e6c49abd82e1bcb566fd Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 5 Sep 2017 14:03:37 -0400 Subject: tcp: horizontal scaling improvements - do not scale syn-ack window - fix the max number of outstanding syns in builtin client - fix syn-sent ack validation to use modulo arithmetic - improve retransmit timer handler - fix output buffer allocator leakage - improved debugging Change-Id: Iac3bc0eadf7d0b494a93e22d210a3153b61b3273 Signed-off-by: Florin Coras --- src/vnet/session/session.c | 21 ++-- src/vnet/session/session_node.c | 6 +- src/vnet/tcp/builtin_client.c | 5 + src/vnet/tcp/tcp.c | 26 ++++- src/vnet/tcp/tcp.h | 7 +- src/vnet/tcp/tcp_debug.h | 49 +++++---- src/vnet/tcp/tcp_error.def | 3 +- src/vnet/tcp/tcp_input.c | 204 ++++++++++++++++++++++++++----------- src/vnet/tcp/tcp_output.c | 212 ++++++++++++++++++--------------------- src/vppinfra/tw_timer_template.c | 11 +- 10 files changed, 335 insertions(+), 209 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 17644e29..4544f9a0 100644 --- a/src/vnet/session/session.c +++ 
b/src/vnet/session/session.c @@ -456,13 +456,16 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) st); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { - clib_warning ("This can't be good!"); + clib_warning ("half-open was removed!"); return -1; } + /* Cleanup half-open table */ + stream_session_half_open_table_del (tc); + /* Get the app's index from the handle we stored when opening connection * and the opaque (api_context for external apps) from transport session - * index*/ + * index */ app = application_get_if_valid (handle >> 32); if (!app) return -1; @@ -499,9 +502,6 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) new_s->session_state = SESSION_STATE_READY; } - /* Cleanup session lookup */ - stream_session_half_open_table_del (tc); - return error; } @@ -535,7 +535,7 @@ stream_session_disconnect_notify (transport_connection_t * tc) } /** - * Cleans up session and associated app if needed. + * Cleans up session and lookup table. */ void stream_session_delete (stream_session_t * s) @@ -559,9 +559,10 @@ stream_session_delete (stream_session_t * s) /** * Notification from transport that connection is being deleted * - * This should be called only on previously fully established sessions. For - * instance failed connects should call stream_session_connect_notify and - * indicate that the connect has failed. + * This removes the session if it is still valid. It should be called only on + * previously fully established sessions. For instance failed connects should + * call stream_session_connect_notify and indicate that the connect has + * failed. 
*/ void stream_session_delete_notify (transport_connection_t * tc) @@ -748,7 +749,7 @@ session_send_session_evt_to_thread (u64 session_handle, if (PREDICT_TRUE (q->cursize < q->maxsize)) { if (unix_shared_memory_queue_add (q, (u8 *) & evt, - 1 /* do wait for mutex */ )) + 0 /* do wait for mutex */ )) { clib_warning ("failed to enqueue evt"); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index dec6d13c..09687687 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -267,7 +267,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, b0 = vlib_get_buffer (vm, bi0); b0->error = 0; - b0->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b0->current_data = 0; b0->total_length_not_including_first_buffer = 0; @@ -321,8 +321,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, })); /* *INDENT-ON* */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (VLIB_BUFFER_TRACE_TRAJECTORY) + b0->pre_data[1] = 3; + if (PREDICT_FALSE (n_trace > 0)) { session_queue_trace_t *t0; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 94e6b4ae..5b4c8679 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -509,6 +509,11 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) /* Crude pacing for call setups */ if ((i % 4) == 0) vlib_process_suspend (vm, 10e-6); + ASSERT (i + 1 >= tm->ready_connections); + while (i + 1 - tm->ready_connections > 8000) + { + vlib_process_suspend (vm, 100e-6); + } } } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index a4c13084..04f1e068 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -160,6 +160,7 @@ tcp_half_open_connection_new (void) { tcp_main_t *tm = vnet_get_tcp_main (); tcp_connection_t *tc = 0; + ASSERT (vlib_get_thread_index () == 0); pool_get (tm->half_open_connections, tc); memset (tc, 0, sizeof (*tc)); 
tc->c_c_index = tc - tm->half_open_connections; @@ -561,6 +562,22 @@ tcp_connection_fib_attach (tcp_connection_t * tc) } #endif /* 0 */ +/** + * Initialize connection send variables. + */ +void +tcp_init_snd_vars (tcp_connection_t * tc) +{ + u32 time_now; + + /* Set random initial sequence */ + time_now = tcp_time_now (); + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_nxt = tc->iss + 1; + tc->snd_una_max = tc->snd_nxt; +} + /** Initialize tcp connection variables * * Should be called after having received a msg from the peer, i.e., a SYN or @@ -572,6 +589,9 @@ tcp_connection_init_vars (tcp_connection_t * tc) tcp_init_mss (tc); scoreboard_init (&tc->sack_sb); tcp_cc_init (tc); + if (tc->state == TCP_STATE_SYN_RCVD) + tcp_init_snd_vars (tc); + // tcp_connection_fib_attach (tc); } @@ -691,6 +711,7 @@ tcp_connection_open (transport_endpoint_t * rmt) TCP_EVT_DBG (TCP_EVT_OPEN, tc); tc->state = TCP_STATE_SYN_SENT; + tcp_init_snd_vars (tc); tcp_send_syn (tc); clib_spinlock_unlock_if_init (&tm->half_open_lock); @@ -784,7 +805,7 @@ format_tcp_vars (u8 * s, va_list * args) tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", - tcp_flight_size (tc), tcp_available_snd_space (tc), + tcp_flight_size (tc), tcp_available_output_snd_space (tc), tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", @@ -1155,6 +1176,9 @@ tcp_timer_establish_handler (u32 conn_index) return; ASSERT (tc->state == TCP_STATE_SYN_RCVD); + /* Start cleanup. App wasn't notified yet so use delete notify as + * opposed to delete to cleanup session layer state. 
*/ + stream_session_delete_notify (&tc->connection); } tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; tcp_connection_cleanup (tc); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 11d61f5d..6020a3de 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -97,7 +97,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; * ticks to timer units */ #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ -#define TCP_SYN_RCVD_TIME 100 /* 10s */ +#define TCP_SYN_RCVD_TIME 600 /* 60s */ #define TCP_2MSL_TIME 300 /* 30s */ #define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ @@ -676,6 +676,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); +void tcp_init_snd_vars (tcp_connection_t * tc); void tcp_connection_init_vars (tcp_connection_t * tc); always_inline void @@ -690,6 +691,7 @@ always_inline void tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) { ASSERT (tc->c_thread_index == vlib_get_thread_index ()); + ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID); tc->timers[timer_id] = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], tc->c_c_index, timer_id, interval); @@ -722,6 +724,7 @@ tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) always_inline void tcp_retransmit_timer_set (tcp_connection_t * tc) { + ASSERT (tc->snd_una != tc->snd_una_max); tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); } @@ -769,7 +772,7 @@ tcp_retransmit_timer_update (tcp_connection_t * tc) { tcp_retransmit_timer_reset (tc); if (tc->snd_wnd < tc->snd_mss) - tcp_persist_timer_set (tc); + tcp_persist_timer_update (tc); } else tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 
fc36eb29..cf77e6e6 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -197,9 +197,10 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->c_c_index; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...) \ +#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \ { \ - TCP_EVT_INIT_HANDLER(_tc, 0); \ + if (_init) \ + TCP_EVT_INIT_HANDLER(_tc, 0); \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "syn-rx: irs %u", \ @@ -275,11 +276,14 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "syn-tx: iss %u", \ - .format_args = "i4", \ + .format = "syn-tx: iss %u snd_una %u snd_una_max %u snd_nxt %u", \ + .format_args = "i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 1); \ + DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->iss; \ + ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[2] = _tc->snd_una_max - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -287,24 +291,30 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "synack-tx: iss %u irs %u", \ - .format_args = "i4i4", \ + .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } #define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "synack-rx: iss %u irs %u", \ - .format_args = "i4i4", \ + .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -361,17 +371,20 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "%s-rxt: iss %u", \ - .format_args = "t4i4", \ + .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \ + .format_args = "t4i4i4i4i4", \ .n_enum_strings = 2, \ .enum_strings = { \ "syn", \ "syn-ack", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 2); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _type; \ ed->data[1] = _tc->iss; \ + ed->data[2] = _tc->irs; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } #else @@ -414,7 +427,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->rcv_nxt - _tc->irs; \ ed->data[1] = _tc->rcv_wnd; \ ed->data[2] = _tc->snd_nxt - _tc->iss; \ - ed->data[3] = tcp_available_wnd(_tc); \ + ed->data[3] = tcp_available_snd_wnd(_tc); \ ed->data[4] = _tc->snd_wnd; \ } @@ -422,7 +435,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ + .format = "ack-rx: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ @@ -452,13 +465,13 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "pktize: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ + .format = "tx: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->snd_nxt - _tc->iss; \ - ed->data[2] = 
tcp_available_snd_space (_tc); \ + ed->data[2] = tcp_available_output_snd_space (_tc); \ ed->data[3] = tcp_flight_size (_tc); \ ed->data[4] = _tc->rcv_wnd; \ } diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index a4e46d64..08922315 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -38,4 +38,5 @@ tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") tcp_error (NO_WND, "No window") -tcp_error (CONNECTION_CLOSED, "Connection closed") \ No newline at end of file +tcp_error (CONNECTION_CLOSED, "Connection closed") +tcp_error (CREATE_EXISTS, "Connection already exists") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 1d903453..841e72a5 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -275,6 +275,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) { + clib_warning ("options parse error"); return -1; } @@ -350,9 +351,12 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tcp_syn (th0)) { /* TODO implement RFC 5961 */ - tcp_make_ack (tc0, b0); + if (tc0->state != TCP_STATE_SYN_RCVD) + tcp_make_ack (tc0, b0); + else + tcp_make_synack (tc0, b0); *next0 = tcp_next_output (tc0->c_is_ip4); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); return -1; } @@ -1842,6 +1846,74 @@ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); vlib_node_registration_t tcp4_syn_sent_node; vlib_node_registration_t tcp6_syn_sent_node; +static u8 +tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) +{ + transport_connection_t *tmp; + if (!tc) + return 1; + + u8 is_valid = (tc->c_lcl_port == hdr->dst_port + && (tc->state == TCP_STATE_LISTEN + || tc->c_rmt_port == hdr->src_port)); + + if (!is_valid) + { + if ((tmp = + 
stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, + tc->c_lcl_port, tc->c_rmt_port, + tc->c_transport_proto))) + { + if (tmp->lcl_port == hdr->dst_port + && tmp->rmt_port == hdr->src_port) + { + clib_warning ("half-open is valid!"); + } + } + } + return is_valid; +} + +/** + * Lookup transport connection + */ +static tcp_connection_t * +tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4) +{ + tcp_header_t *tcp; + transport_connection_t *tconn; + tcp_connection_t *tc; + if (is_ip4) + { + ip4_header_t *ip4; + ip4 = vlib_buffer_get_current (b); + tcp = ip4_next_header (ip4); + tconn = stream_session_lookup_transport_wt4 (&ip4->dst_address, + &ip4->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP4_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + else + { + ip6_header_t *ip6; + ip6 = vlib_buffer_get_current (b); + tcp = ip6_next_header (ip6); + tconn = stream_session_lookup_transport_wt6 (&ip6->dst_address, + &ip6->src_address, + tcp->dst_port, + tcp->src_port, + SESSION_TYPE_IP6_TCP, + thread_index); + tc = tcp_get_connection_from_transport (tconn); + ASSERT (tcp_lookup_is_valid (tc, tcp)); + } + return tc; +} + always_inline uword tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -1888,6 +1960,15 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + /* Half-open completed recently but the connection was't removed + * yet by the owning thread */ + if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE)) + { + /* Make sure the connection actually exists */ + ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4)); + goto drop; + } + ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); @@ -1914,16 +1995,20 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (tcp_ack (tcp0)) { 
- if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) + if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt)) { + clib_warning ("ack not in rcv wnd"); if (!tcp_rst (tcp0)) tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } /* Make sure ACK is valid */ - if (tc0->snd_una > ack0) - goto drop; + if (seq_gt (tc0->snd_una, ack0)) + { + clib_warning ("ack invalid"); + goto drop; + } } /* @@ -1949,11 +2034,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* No SYN flag. Drop. */ if (!tcp_syn (tcp0)) - goto drop; + { + clib_warning ("not synack"); + goto drop; + } /* Parse options */ if (tcp_options_parse (tcp0, &tc0->rcv_opts)) - goto drop; + { + clib_warning ("options parse fail"); + goto drop; + } /* Valid SYN or SYN-ACK. Move connection from half-open pool to * current thread pool. */ @@ -1981,8 +2072,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_opts_wscale (&new_tc0->rcv_opts)) new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; - new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) - << new_tc0->snd_wscale; + /* RFC1323: SYN and SYN-ACK wnd not scaled */ + new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); new_tc0->snd_wl1 = seq0; new_tc0->snd_wl2 = ack0; @@ -2004,6 +2095,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * allocate session send reset */ if (stream_session_connect_notify (&new_tc0->connection, 0)) { + clib_warning ("connect notify fail"); tcp_send_reset_w_pkt (new_tc0, b0, is_ip4); tcp_connection_cleanup (new_tc0); goto drop; @@ -2032,6 +2124,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tc0->rtt_ts = 0; + tcp_init_snd_vars (tc0); tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2196,6 +2289,18 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len; + if (CLIB_DEBUG) + { + tcp_connection_t *tmp; + tmp = 
tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (tmp->state != tc0->state) + { + clib_warning ("state changed"); + ASSERT (0); + goto drop; + } + } + /* * Special treatment for CLOSED */ @@ -2211,8 +2316,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ /* 1-4: check SEQ, RST, SYN */ - if (PREDICT_FALSE - (tcp_segment_validate (vm, tc0, b0, tcp0, &next0))) + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0, + &next0))) { error0 = TCP_ERROR_SEGMENT_INVALID; goto drop; @@ -2230,6 +2335,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { + clib_warning ("connection not accepted"); tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } @@ -2252,6 +2358,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Reset SYN-ACK retransmit and SYN_RCV establish timers */ tcp_retransmit_timer_reset (tc0); tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_ESTABLISHED: /* We can get packets in established state here because they @@ -2400,6 +2507,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Send FIN-ACK notify app and enter CLOSE-WAIT */ tcp_connection_timers_reset (tc0); tcp_make_fin (tc0, b0); + tc0->snd_nxt += 1; next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; @@ -2598,6 +2706,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 3. 
check for a SYN (did that already) */ + /* Make sure connection wasn't just created */ + child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4); + if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN)) + { + error0 = TCP_ERROR_CREATE_EXISTS; + goto drop; + } + /* Create child session and send SYN-ACK */ child0 = tcp_connection_new (my_thread_index); child0->c_lcl_port = lc0->c_lcl_port; @@ -2621,12 +2737,15 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, 0 /* notify */ )) { + clib_warning ("session accept fail"); + tcp_connection_cleanup (child0); error0 = TCP_ERROR_CREATE_SESSION_FAIL; goto drop; } if (tcp_options_parse (th0, &child0->rcv_opts)) { + clib_warning ("options parse fail"); goto drop; } @@ -2651,7 +2770,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; tcp_connection_init_vars (child0); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1); /* Reuse buffer to make syn-ack and send */ tcp_make_synack (child0, b0); @@ -2768,34 +2887,6 @@ typedef enum _tcp_input_next #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) -static u8 -tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) -{ - transport_connection_t *tmp; - if (!tc) - return 1; - - u8 is_valid = (tc->c_lcl_port == hdr->dst_port - && (tc->state == TCP_STATE_LISTEN - || tc->c_rmt_port == hdr->src_port)); - - if (!is_valid) - { - if ((tmp = - stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip, - tc->c_lcl_port, tc->c_rmt_port, - tc->c_transport_proto))) - { - if (tmp->lcl_port == hdr->dst_port - && tmp->rmt_port == hdr->src_port) - { - clib_warning ("half-open is valid!"); - } - } - } - return is_valid; -} - always_inline uword tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -2822,6 +2913,7 @@ 
tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; + transport_connection_t *tconn; ip4_header_t *ip40; ip6_header_t *ip60; u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; @@ -2847,15 +2939,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + tcp_header_bytes (tcp0)); n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0; - - tc0 = - (tcp_connection_t *) - stream_session_lookup_transport_wt4 (&ip40->dst_address, - &ip40->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP4_TCP, - my_thread_index); + tconn = stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } else @@ -2866,15 +2956,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - n_advance_bytes0; n_advance_bytes0 += sizeof (ip60[0]); - - tc0 = - (tcp_connection_t *) - stream_session_lookup_transport_wt6 (&ip60->dst_address, - &ip60->src_address, - tcp0->dst_port, - tcp0->src_port, - SESSION_TYPE_IP6_TCP, - my_thread_index); + tconn = stream_session_lookup_transport_wt6 (&ip60->dst_address, + &ip60->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + tc0 = tcp_get_connection_from_transport (tconn); ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 15a9dcb4..9cb3e779 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -66,11 +66,10 @@ format_tcp_tx_trace (u8 * s, va_list * args) } static u8 -tcp_window_compute_scale (u32 available_space) +tcp_window_compute_scale (u32 window) { u8 wnd_scale = 0; - while (wnd_scale < TCP_MAX_WND_SCALE - && (available_space >> 
wnd_scale) > TCP_WND_MAX) + while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX) wnd_scale++; return wnd_scale; } @@ -444,12 +443,10 @@ tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) vec_validate (tm->tx_buffers[thread_index], current_length + n_free_buffers - 1); - _vec_len (tm->tx_buffers[thread_index]) = - current_length + vlib_buffer_alloc_from_free_list (vlib_get_main (), - tm->tx_buffers - [thread_index], - n_free_buffers, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + _vec_len (tm->tx_buffers[thread_index]) = current_length + + vlib_buffer_alloc (vlib_get_main (), + &tm->tx_buffers[thread_index][current_length], + n_free_buffers); /* buffer shortage, report failure */ if (vec_len (tm->tx_buffers[thread_index]) == 0) { @@ -470,7 +467,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) return -1; } my_tx_buffers = tm->tx_buffers[thread_index]; - *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; + *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1]; _vec_len (my_tx_buffers) -= 1; return 0; } @@ -478,10 +475,7 @@ tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) always_inline void tcp_return_buffer (tcp_main_t * tm) { - u32 *my_tx_buffers; - u32 thread_index = vlib_get_thread_index (); - my_tx_buffers = tm->tx_buffers[thread_index]; - _vec_len (my_tx_buffers) += 1; + _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1; } always_inline void * @@ -489,7 +483,8 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { if (b->flags & VLIB_BUFFER_NEXT_PRESENT) vlib_buffer_free_one (vm, b->next_buffer); - b->flags = 0; + /* Zero all flags but free list index and trace flag */ + b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1; b->current_data = 0; b->current_length = 0; b->total_length_not_including_first_buffer = 0; @@ -503,7 +498,8 @@ always_inline void * tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) { ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); - b->flags = 
VNET_BUFFER_F_LOCALLY_ORIGINATED; + b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->total_length_not_including_first_buffer = 0; vnet_buffer (b)->tcp.flags = 0; @@ -567,8 +563,34 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) /* Reset flags, make sure ack is sent */ vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +/** + * Convert buffer to SYN + */ +void +tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b) +{ + u8 tcp_hdr_opts_len, tcp_opts_len; + tcp_header_t *th; + u16 initial_wnd; + tcp_options_t snd_opts; + + initial_wnd = tcp_initial_window_to_advertise (tc); - tc->snd_nxt += 1; + /* Make and write options */ + memset (&snd_opts, 0, sizeof (snd_opts)); + tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, + initial_wnd); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + tcp_options_write ((u8 *) (th + 1), &snd_opts); + + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, + tc->rto * TCP_TO_TIMER_TICK); } /** @@ -582,37 +604,25 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; u16 initial_wnd; - u32 time_now; memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_reuse_buffer (vm, b); - /* Set random initial sequence */ - time_now = tcp_time_now (); - - tc->iss = random_u32 (&time_now); - tc->snd_una = tc->iss; - tc->snd_nxt = tc->iss + 1; - tc->snd_una_max = tc->snd_nxt; - initial_wnd = tcp_initial_window_to_advertise (tc); - - /* Make and write options */ tcp_opts_len = tcp_make_synack_options (tc, snd_opts); tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd); - tcp_options_write ((u8 
*) (th + 1), snd_opts); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; - /* Init retransmit timer */ - tcp_retransmit_timer_set (tc); + /* Init retransmit timer. Use update instead of set because of + * retransmissions */ + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } @@ -918,44 +928,17 @@ tcp_send_syn (tcp_connection_t * tc) u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u8 tcp_hdr_opts_len, tcp_opts_len; - tcp_header_t *th; - u32 time_now; - u16 initial_wnd; - tcp_options_t snd_opts; if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); - - /* Set random initial sequence */ - time_now = tcp_time_now (); - - tc->iss = random_u32 (&time_now); - tc->snd_una = tc->iss; - tc->snd_una_max = tc->snd_nxt = tc->iss + 1; - - initial_wnd = tcp_initial_window_to_advertise (tc); - - /* Make and write options */ - memset (&snd_opts, 0, sizeof (snd_opts)); - tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); - - th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, - tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, - initial_wnd); - - tcp_options_write ((u8 *) (th + 1), &snd_opts); + tcp_make_syn (tc, b); /* Measure RTT with this */ tc->rtt_ts = tcp_time_now (); tc->rtt_seq = tc->snd_nxt; - - /* Start retransmit trimer */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK); tc->rto_boff = 0; /* Set the connection establishment timer */ @@ -1010,8 +993,12 @@ tcp_send_fin (tcp_connection_t * tc) /* buffer will be initialized by in tcp_make_fin */ tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); - tc->flags |= TCP_CONN_FINSNT; - tc->flags &= ~TCP_CONN_FINPNDG; + if (!(tc->flags & TCP_CONN_FINSNT)) + { + tc->flags |= TCP_CONN_FINSNT; + tc->flags &= 
~TCP_CONN_FINPNDG; + tc->snd_nxt += 1; + } tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -1146,6 +1133,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, * Make sure we can retransmit something */ available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + ASSERT (available_bytes >= offset); available_bytes -= offset; if (!available_bytes) return 0; @@ -1209,6 +1197,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, VLIB_FRAME_SIZE - available_bufs)) { tcp_return_buffer (tm); + *b = 0; return 0; } } @@ -1236,7 +1225,7 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, ASSERT (n_peeked == len_to_deq); n_bytes += n_peeked; chain_b->current_length = n_peeked; - chain_b->flags = 0; + chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK; chain_b->next_buffer = 0; /* update previous buffer */ @@ -1310,19 +1299,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } - if (!tcp_in_recovery (tc) && tc->rto_boff > 0 - && tc->state >= TCP_STATE_ESTABLISHED) - { - tc->rto_boff = 0; - tcp_update_rto (tc); - } - - /* Increment RTO backoff (also equal to number of retries) */ - tc->rto_boff += 1; - - /* Go back to first un-acked byte */ - tc->snd_nxt = tc->snd_una; - if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ @@ -1332,6 +1308,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) return; } + /* We're not in recovery so make sure rto_boff is 0 */ + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + + /* Increment RTO backoff (also equal to number of retries) and go back + * to first un-acked byte */ + tc->rto_boff += 1; + tc->snd_nxt = tc->snd_una; + /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); @@ -1349,12 +1337,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (n_bytes == 
0) { - if (b) - { - clib_warning ("retransmit fail: %U", format_tcp_connection, tc, - 2); - ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion); - } + ASSERT (!b); + if (tc->snd_una == tc->snd_una_max) + return; + ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion); + clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2); /* Try again eventually */ tcp_retransmit_timer_set (tc); return; @@ -1365,16 +1352,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* For first retransmit, record timestamp (Eifel detection RFC3522) */ if (tc->rto_boff == 1) tc->snd_rxt_ts = tcp_time_now (); + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_retransmit_timer_update (tc); } - /* Retransmit for SYN/SYNACK */ - else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) + /* Retransmit for SYN */ + else if (tc->state == TCP_STATE_SYN_SENT) { /* Half-open connection actually moved to established but we were * waiting for syn retransmit to pop to call cleanup from the right * thread. */ if (tc->flags & TCP_CONN_HALF_OPEN_DONE) { - ASSERT (tc->state == TCP_STATE_SYN_SENT); if (tcp_half_open_connection_cleanup (tc)) { clib_warning ("could not remove half-open connection"); @@ -1385,49 +1374,46 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ + tc->rto_boff += 1; if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - { - clib_warning ("tcp_get_free_buffer_index FAIL"); - return; - } + return; + b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); - tcp_push_hdr_i (tc, b, tc->state, 1); + tcp_make_syn (tc, b); - /* Account for the SYN */ - tc->snd_nxt += 1; tc->rtt_ts = 0; - TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, - (tc->state == TCP_STATE_SYN_SENT ? 
0 : 1)); + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0); + + /* This goes straight to ipx_lookup. Retransmit timer set already */ + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); } - else + /* Retransmit SYN-ACK */ + else if (tc->state == TCP_STATE_SYN_RCVD) { - ASSERT (tc->state == TCP_STATE_CLOSED); - clib_warning ("connection closed ..."); - return; - } + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + tc->rtt_ts = 0; - if (!is_syn) - { - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + + b = vlib_get_buffer (vm, bi); + tcp_make_synack (tc, b); + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1); - /* Re-enable retransmit timer */ - tcp_retransmit_timer_set (tc); + /* Retransmit timer already updated, just enqueue to output */ + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } else { - ASSERT (tc->state == TCP_STATE_SYN_SENT); - - /* This goes straight to ipx_lookup */ - tcp_push_ip_hdr (tm, tc, b); - tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); - - /* Re-enable retransmit timer */ - tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, - tc->rto * TCP_TO_TIMER_TICK); + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; } } diff --git a/src/vppinfra/tw_timer_template.c b/src/vppinfra/tw_timer_template.c index aba00142..abad3718 100644 --- a/src/vppinfra/tw_timer_template.c +++ b/src/vppinfra/tw_timer_template.c @@ -572,7 +572,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif pool_put (tw->timers, t); } @@ -635,7 +636,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif 
pool_put (tw->timers, t); } @@ -689,7 +691,8 @@ static inline { vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, + t - tw->timers); #endif pool_put (tw->timers, t); } @@ -725,7 +728,7 @@ static inline next_index = t->next; vec_add1 (callback_vector, t->user_handle); #if TW_START_STOP_TRACE_SIZE > 0 - TW (tw_timer_trace) (tw, 0xfe, ~0, t - tw->timers); + TW (tw_timer_trace) (tw, 0xfe, t->user_handle, t - tw->timers); #endif pool_put (tw->timers, t); } -- cgit 1.2.3-korg From 9d063047eb1a3738cb0fc9ebebb55793d155bb20 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 14 Sep 2017 03:08:00 -0400 Subject: session/tcp: improve preallocated segment handling - add preallocated segment flag - don't remove pre-allocated segments except if application detaches - when preallocating fifos in multiple segments, completely fill a segment before moving to the next - detach server application from segment-managers when deleting app - batch syn/syn-ack/fin (re)transmissions - loosen up close-wait and time-wait times Change-Id: I412f53ce601cc83b3acc26aeffd7fa2d52d73b03 Signed-off-by: Florin Coras --- src/svm/svm_fifo_segment.c | 33 +++++++------- src/svm/svm_fifo_segment.h | 1 + src/vnet/session/application.c | 2 + src/vnet/session/segment_manager.c | 73 ++++++++++++++++-------------- src/vnet/session/session.c | 18 +++----- src/vnet/session/session_node.c | 21 +++++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 7 ++- src/vnet/tcp/tcp.h | 9 ++-- src/vnet/tcp/tcp_debug.h | 9 +--- src/vnet/tcp/tcp_input.c | 81 +++++++++++++++++---------------- src/vnet/tcp/tcp_output.c | 91 ++++++++++++++++++++++++++++++-------- 12 files changed, 209 insertions(+), 138 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index 3bdd2b28..da2b7935 100644 --- a/src/svm/svm_fifo_segment.c 
+++ b/src/svm/svm_fifo_segment.c @@ -57,11 +57,12 @@ allocate_new_fifo_chunk (svm_fifo_segment_header_t * fsh, } static void -preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, +preallocate_fifo_pairs (svm_fifo_segment_private_t * s, svm_fifo_segment_create_args_t * a) { - u32 rx_fifo_size, tx_fifo_size; - u32 rx_rounded_data_size, tx_rounded_data_size; + svm_fifo_segment_header_t *fsh = s->h; + u32 rx_fifo_size, tx_fifo_size, pairs_to_allocate; + u32 rx_rounded_data_size, tx_rounded_data_size, pair_size; svm_fifo_t *f; u8 *rx_fifo_space, *tx_fifo_space; int rx_freelist_index, tx_freelist_index; @@ -97,10 +98,11 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, - max_log2 (FIFO_SEGMENT_MIN_FIFO_SIZE); /* Calculate space requirements */ - rx_fifo_size = (sizeof (*f) + rx_rounded_data_size) - * a->preallocated_fifo_pairs; - tx_fifo_size = (sizeof (*f) + tx_rounded_data_size) - * a->preallocated_fifo_pairs; + pair_size = 2 * sizeof (*f) + rx_rounded_data_size + tx_rounded_data_size; + pairs_to_allocate = clib_min (s->ssvm.ssvm_size / pair_size, + a->preallocated_fifo_pairs); + rx_fifo_size = (sizeof (*f) + rx_rounded_data_size) * pairs_to_allocate; + tx_fifo_size = (sizeof (*f) + tx_rounded_data_size) * pairs_to_allocate; vec_validate_init_empty (fsh->free_fifos, clib_max (rx_freelist_index, tx_freelist_index), @@ -139,7 +141,7 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, /* Carve rx fifo space */ f = (svm_fifo_t *) rx_fifo_space; - for (i = 0; i < a->preallocated_fifo_pairs; i++) + for (i = 0; i < pairs_to_allocate; i++) { f->freelist_index = rx_freelist_index; f->next = fsh->free_fifos[rx_freelist_index]; @@ -149,7 +151,7 @@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, } /* Carve tx fifo space */ f = (svm_fifo_t *) tx_fifo_space; - for (i = 0; i < a->preallocated_fifo_pairs; i++) + for (i = 0; i < pairs_to_allocate; i++) { f->freelist_index = tx_freelist_index; f->next = fsh->free_fifos[tx_freelist_index]; @@ -157,6 +159,9 
@@ preallocate_fifo_pairs (svm_fifo_segment_header_t * fsh, tx_fifo_space += sizeof (*f) + tx_rounded_data_size; f = (svm_fifo_t *) tx_fifo_space; } + + /* Account for the pairs allocated */ + a->preallocated_fifo_pairs -= pairs_to_allocate; } /** (master) create an svm fifo segment */ @@ -200,7 +205,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) sh->opaque[0] = fsh; s->h = fsh; fsh->segment_name = format (0, "%s%c", a->segment_name, 0); - preallocate_fifo_pairs (fsh, a); + preallocate_fifo_pairs (s, a); ssvm_pop_heap (oldheap); @@ -245,10 +250,6 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) segment_count = a->private_segment_count; } - /* Spread preallocated fifo pairs across segments */ - a->preallocated_fifo_pairs = - (a->preallocated_fifo_pairs + segment_count - 1) / segment_count; - /* Allocate segments */ for (i = 0; i < segment_count; i++) { @@ -280,9 +281,11 @@ svm_fifo_segment_create_process_private (svm_fifo_segment_create_args_t * a) if (a->private_segment_count) { + if (i != 0) + fsh->flags |= FIFO_SEGMENT_F_IS_PREALLOCATED; oldheap = clib_mem_get_heap (); clib_mem_set_heap (sh->heap); - preallocate_fifo_pairs (fsh, a); + preallocate_fifo_pairs (s, a); clib_mem_set_heap (oldheap); } sh->ready = 1; diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h index 7c97e9b4..5b771328 100644 --- a/src/svm/svm_fifo_segment.h +++ b/src/svm/svm_fifo_segment.h @@ -33,6 +33,7 @@ typedef enum #define FIFO_SEGMENT_F_IS_PRIVATE 1 << 0 /* Private segment */ #define FIFO_SEGMENT_F_IS_MAIN_HEAP 1 << 1 /* Segment is main heap */ +#define FIFO_SEGMENT_F_IS_PREALLOCATED 1 << 2 /* Segment is preallocated */ typedef struct { diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c index d105119c..2b789c5f 100644 --- a/src/vnet/session/application.c +++ b/src/vnet/session/application.c @@ -111,6 +111,8 @@ application_del (application_t * app) hash_foreach (handle, index, app->listeners_table, 
({ vec_add1 (handles, handle); + sm = segment_manager_get (index); + sm->app_index = SEGMENT_MANAGER_INVALID_APP_INDEX; })); /* *INDENT-ON* */ diff --git a/src/vnet/session/segment_manager.c b/src/vnet/session/segment_manager.c index c23e4c02..48d02755 100644 --- a/src/vnet/session/segment_manager.c +++ b/src/vnet/session/segment_manager.c @@ -197,27 +197,24 @@ u8 segment_manager_has_fifos (segment_manager_t * sm) { svm_fifo_segment_private_t *segment; - /* Weird, but handle it */ - if (vec_len (sm->segment_indices) == 0) - return 0; - if (vec_len (sm->segment_indices) == 1) - { - segment = svm_fifo_segment_get_segment (sm->segment_indices[0]); - if (svm_fifo_segment_num_fifos (segment) == 0) - return 0; - } - if (CLIB_DEBUG) + int i; + + for (i = 0; i < vec_len (sm->segment_indices); i++) { - svm_fifo_segment_private_t *segment; - int i; - for (i = 1; i < vec_len (sm->segment_indices); i++) - { - segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); - if (!svm_fifo_segment_has_fifos (segment)) - clib_warning ("segment has no fifos!"); - } + segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); + if (CLIB_DEBUG && i && !svm_fifo_segment_has_fifos (segment) + && !(segment->h->flags & FIFO_SEGMENT_F_IS_PREALLOCATED)) + clib_warning ("segment %d has no fifos!", sm->segment_indices[i]); + if (svm_fifo_segment_has_fifos (segment)) + return 1; } - return 1; + return 0; +} + +static u8 +segment_manager_app_detached (segment_manager_t * sm) +{ + return (sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX); } static void @@ -228,6 +225,13 @@ segment_manager_del_segment (segment_manager_t * sm, u32 segment_index) clib_spinlock_lock (&sm->lockp); svm_segment_index = sm->segment_indices[segment_index]; fifo_segment = svm_fifo_segment_get_segment (svm_segment_index); + if (!fifo_segment + || ((fifo_segment->h->flags & FIFO_SEGMENT_F_IS_PREALLOCATED) + && !segment_manager_app_detached (sm))) + { + clib_spinlock_unlock (&sm->lockp); + return; + } 
svm_fifo_segment_delete (fifo_segment); vec_del1 (sm->segment_indices, segment_index); clib_spinlock_unlock (&sm->lockp); @@ -288,26 +292,29 @@ segment_manager_del_sessions (segment_manager_t * sm) * * Since the fifos allocated in the segment keep backpointers to the sessions * prior to removing the segment, we call session disconnect. This - * subsequently propages into transport. + * subsequently propagates into transport. */ void segment_manager_del (segment_manager_t * sm) { + int i; - ASSERT (vec_len (sm->segment_indices) <= 1); - if (vec_len (sm->segment_indices)) + ASSERT (!segment_manager_has_fifos (sm) + && segment_manager_app_detached (sm)); + + /* If we have empty preallocated segments that haven't been removed, remove + * them now. Apart from that, the first segment in the first segment manager + * is not removed when all fifos are removed. It can only be removed when + * the manager is explicitly deleted/detached by the app. */ + for (i = vec_len (sm->segment_indices) - 1; i >= 0; i--) { - /* The first segment in the first segment manager is not removed when - * all fifos are removed. It can only be removed when the manager is - * explicitly deleted/detached by the app. 
*/ if (CLIB_DEBUG) { - svm_fifo_segment_private_t *fifo_segment; - fifo_segment = - svm_fifo_segment_get_segment (sm->segment_indices[0]); - ASSERT (!svm_fifo_segment_has_fifos (fifo_segment)); + svm_fifo_segment_private_t *segment; + segment = svm_fifo_segment_get_segment (sm->segment_indices[i]); + ASSERT (!svm_fifo_segment_has_fifos (segment)); } - segment_manager_del_segment (sm, 0); + segment_manager_del_segment (sm, i); } clib_spinlock_free (&sm->lockp); if (CLIB_DEBUG) @@ -322,8 +329,7 @@ segment_manager_init_del (segment_manager_t * sm) segment_manager_del_sessions (sm); else { - ASSERT (!sm->first_is_protected - || sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX); + ASSERT (!sm->first_is_protected || segment_manager_app_detached (sm)); segment_manager_del (sm); } } @@ -478,7 +484,8 @@ segment_manager_dealloc_fifos (u32 svm_segment_index, svm_fifo_t * rx_fifo, } /* Remove segment manager if no sessions and detached from app */ - if (sm->app_index == SEGMENT_MANAGER_INVALID_APP_INDEX && is_first) + if (segment_manager_app_detached (sm) + && !segment_manager_has_fifos (sm)) segment_manager_del (sm); } } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 792e6612..dc930ce8 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -453,7 +453,7 @@ stream_session_connect_notify (transport_connection_t * tc, u8 is_fail) st); if (handle == HALF_OPEN_LOOKUP_INVALID_VALUE) { - clib_warning ("half-open was removed!"); + TCP_DBG ("half-open was removed!"); return -1; } @@ -732,6 +732,7 @@ session_send_session_evt_to_thread (u64 session_handle, u32 thread_index) { static u16 serial_number = 0; + u32 tries = 0; session_fifo_event_t evt; unix_shared_memory_queue_t *q; @@ -741,21 +742,14 @@ session_send_session_evt_to_thread (u64 session_handle, evt.event_id = serial_number++; q = session_manager_get_vpp_event_queue (thread_index); - - /* Based on request block (or not) for lack of space */ - if (PREDICT_TRUE (q->cursize < 
q->maxsize)) + while (unix_shared_memory_queue_add (q, (u8 *) & evt, 1)) { - if (unix_shared_memory_queue_add (q, (u8 *) & evt, - 0 /* do wait for mutex */ )) + if (tries++ == 3) { - clib_warning ("failed to enqueue evt"); + TCP_DBG ("failed to enqueue evt"); + break; } } - else - { - clib_warning ("queue full"); - return; - } } /** diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 09687687..d0155849 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -168,15 +168,19 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, return 0; } + /* Check how much we can pull. */ + max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo); + if (peek_data) { - /* Offset in rx fifo from where to peek data */ + /* Offset in rx fifo from where to peek data */ tx_offset = transport_vft->tx_fifo_offset (tc0); + if (PREDICT_FALSE (tx_offset >= max_dequeue0)) + max_dequeue0 = 0; + else + max_dequeue0 -= tx_offset; } - /* Check how much we can pull. 
If buffering, subtract the offset */ - max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - tx_offset; - /* Nothing to read return */ if (max_dequeue0 == 0) { @@ -277,6 +281,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, tx_offset, len_to_deq0, data0); + if (n_bytes_read <= 0) + goto dequeue_fail; /* Keep track of progress locally, transport is also supposed to * increment it independently when pushing the header */ tx_offset += n_bytes_read; @@ -285,11 +291,10 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, { n_bytes_read = svm_fifo_dequeue_nowait (s0->server_tx_fifo, len_to_deq0, data0); + if (n_bytes_read <= 0) + goto dequeue_fail; } - if (n_bytes_read <= 0) - goto dequeue_fail; - b0->current_length = n_bytes_read; left_to_snd0 -= n_bytes_read; @@ -616,7 +621,7 @@ skip_dequeue: case FIFO_EVENT_APP_TX: s0 = session_event_get_session (e0, my_thread_index); - if (CLIB_DEBUG && !s0) + if (PREDICT_FALSE (!s0)) { clib_warning ("It's dead, Jim!"); continue; diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 5b4c8679..527b3289 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -510,7 +510,7 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) if ((i % 4) == 0) vlib_process_suspend (vm, 10e-6); ASSERT (i + 1 >= tm->ready_connections); - while (i + 1 - tm->ready_connections > 8000) + while (i + 1 - tm->ready_connections > 1000) { vlib_process_suspend (vm, 100e-6); } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 04f1e068..f779428f 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1035,7 +1035,7 @@ tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space) /* If not snd_wnd constrained and we can't write at least a segment, * don't try at all */ if (PREDICT_FALSE (snd_space < tc->snd_mss)) - return 0; + return snd_space < tc->cwnd ? 
0 : snd_space; /* round down to mss multiple */ return snd_space - (snd_space % tc->snd_mss); @@ -1167,6 +1167,7 @@ tcp_timer_establish_handler (u32 conn_index) { ASSERT (tc->state == TCP_STATE_SYN_SENT); stream_session_connect_notify (&tc->connection, 1 /* fail */ ); + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); } else { @@ -1174,7 +1175,7 @@ tcp_timer_establish_handler (u32 conn_index) /* note: the connection may have already disappeared */ if (PREDICT_FALSE (tc == 0)) return; - + TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2); ASSERT (tc->state == TCP_STATE_SYN_RCVD); /* Start cleanup. App wasn't notified yet so use delete notify as * opposed to delete to cleanup session layer state. */ @@ -1369,6 +1370,8 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->tx_frames[0], num_threads - 1); vec_validate (tm->tx_frames[1], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1); + vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1); tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 6020a3de..bb8091af 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -99,8 +99,9 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_SYN_RCVD_TIME 600 /* 60s */ #define TCP_2MSL_TIME 300 /* 30s */ -#define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ -#define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ +#define TCP_CLOSEWAIT_TIME 20 /* 2s */ +#define TCP_TIMEWAIT_TIME 20 /* 2s */ +#define TCP_CLEANUP_TIME 10 /* 1s Time to wait before cleanup */ #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ #define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ @@ -372,8 +373,10 @@ typedef struct _tcp_main /** per-worker tx buffer free lists */ u32 **tx_buffers; - /** per-worker tx frames to 4/6 output nodes */ + /** per-worker tx frames to tcp 
4/6 output nodes */ vlib_frame_t **tx_frames[2]; + /** per-worker tx frames to ip 4/6 lookup nodes */ + vlib_frame_t **ip_lookup_tx_frames[2]; /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index cf77e6e6..4bc6b42e 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -82,13 +82,7 @@ typedef enum _tcp_dbg_evt * Infra and evt track setup */ -#define TCP_DBG(_tc, _evt, _args...) \ -{ \ - u8 *_tmp = 0; \ - _tmp = format(_tmp, "%U", format_tcp_connection_verbose, _tc); \ - clib_warning("%s", _tmp); \ - vec_free(_tmp); \ -} +#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args) #define DECLARE_ETD(_tc, _e, _size) \ struct \ @@ -240,6 +234,7 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else #define TCP_EVT_DBG(_evt, _args...) +#define TCP_DBG(_fmt, _args...) #endif /* diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 841e72a5..64a07070 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -351,12 +351,17 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tcp_syn (th0)) { /* TODO implement RFC 5961 */ - if (tc0->state != TCP_STATE_SYN_RCVD) - tcp_make_ack (tc0, b0); + if (tc0->state == TCP_STATE_SYN_RCVD) + { + tcp_make_synack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); + } else - tcp_make_synack (tc0, b0); + { + tcp_make_ack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0); + } *next0 = tcp_next_output (tc0->c_is_ip4); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); return -1; } @@ -1747,18 +1752,17 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (PREDICT_FALSE (is_fin)) { - /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead - * wait for session to call close. To avoid lingering + /* Enter CLOSE-WAIT and notify session. 
To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ - tc0->state = TCP_STATE_CLOSE_WAIT; - TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + /* Account for the FIN if nothing else was received */ if (vnet_buffer (b0)->tcp.data_len == 0) - { - tc0->rcv_nxt += 1; - next0 = TCP_ESTABLISHED_NEXT_DROP; - } + tc0->rcv_nxt += 1; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; stream_session_disconnect_notify (&tc0->connection); tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); } done: @@ -1973,6 +1977,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); + /* Crude check to see if the connection handle does not match + * the packet. Probably connection just switched to established */ + if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port + || tcp0->src_port != tc0->c_rmt_port)) + goto drop; + if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; @@ -2265,6 +2275,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + u8 is_fin0; bi0 = from[0]; to_next[0] = bi0; @@ -2283,11 +2294,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tcp0 = tcp_buffer_hdr (b0); + is_fin0 = tcp_is_fin (tcp0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) - + vnet_buffer (b0)->tcp.data_len; + + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len; if (CLIB_DEBUG) { @@ -2384,21 +2395,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ else if (tc0->snd_una == tc0->snd_una_max) { - tc0->rcv_nxt += 1; tc0->state = TCP_STATE_FIN_WAIT_2; 
TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - if (tcp_fin (tcp0)) - { - /* Stop all timers, 2MSL will be set lower */ - tcp_connection_timers_reset (tc0); - } - else - { - /* Wait for peer to finish sending its data */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, - TCP_2MSL_TIME); - } + /* Stop all retransmit timers because we have nothing more + * to send. Enable waitclose though because we're willing to + * wait for peer's FIN but not indefinitely. */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); } break; case TCP_STATE_FIN_WAIT_2: @@ -2434,10 +2438,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; /* Apparently our FIN was lost */ - if (tcp_fin (tcp0)) + if (is_fin0) { - /* Don't "make" fin since that increments snd_nxt */ tcp_send_fin (tc0); goto drop; } @@ -2450,8 +2454,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * particular, this makes sure that we won't have dead sessions * when processing events on the tx path */ tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); - - /* Stop retransmit */ tcp_retransmit_timer_reset (tc0); goto drop; @@ -2466,8 +2468,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; tcp_make_ack (tc0, b0); - tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); goto drop; @@ -2486,6 +2487,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: if (vnet_buffer (b0)->tcp.data_len) error0 = tcp_segment_rcv (tm, tc0, b0, &next0); + else if (is_fin0) + tc0->rcv_nxt += 1; break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2497,7 +2500,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* 8: 
check the FIN bit */ - if (!tcp_fin (tcp0)) + if (!is_fin0) goto drop; switch (tc0->state) @@ -2527,19 +2530,19 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; case TCP_STATE_FIN_WAIT_2: - /* Got FIN, send ACK! */ + /* Got FIN, send ACK! Be more aggressive with resource cleanup */ tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_TIME_WAIT: - /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + /* Remain in the TIME-WAIT state. Restart the time-wait * timeout. */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); break; } TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); @@ -3162,9 +3165,9 @@ do { \ TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); - _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); - _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index b843c926..be29f05f 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -629,9 +629,11 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) } always_inline void -tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, - u8 is_ip4) +tcp_enqueue_to_ip_lookup_i (vlib_main_t * 
vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) { + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); u32 *to_next, next_index; vlib_frame_t *f; @@ -643,13 +645,42 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, /* Send to IP lookup */ next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; - f = vlib_get_frame_to_node (vm, next_index); + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 2; + b->pre_data[1] = next_index; + } + + f = tm->ip_lookup_tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f; + } - /* Enqueue the packet */ to_next = vlib_frame_vector_args (f); - to_next[0] = bi; - f->n_vectors = 1; - vlib_put_frame_to_node (vm, next_index, f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1); +} + +always_inline void +tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0); } always_inline void @@ -666,8 +697,6 @@ tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, /* Decide where to send the packet */ next_index = is_ip4 ? 
tcp4_output_node.index : tcp6_output_node.index; - - /* Initialize the trajectory trace, if configured */ if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) { b->pre_data[0] = 1; @@ -856,7 +885,7 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) ASSERT (!bogus); } - tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); + tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4); TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } @@ -968,7 +997,24 @@ tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) } /** - * Flush both v4 and v6 tx frames for thread index + * Flush ip lookup tx frames populated by timer pops + */ +always_inline void +tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.ip_lookup_tx_frames[!is_ip4] + [thread_index]); + tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush v4 and v6 tcp and ip-lookup tx frames for thread index */ void tcp_flush_frames_to_output (u8 thread_index) @@ -976,6 +1022,8 @@ tcp_flush_frames_to_output (u8 thread_index) vlib_main_t *vm = vlib_get_main (); tcp_flush_frame_to_output (vm, thread_index, 1); tcp_flush_frame_to_output (vm, thread_index, 0); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 1); + tcp_flush_frame_to_ip_lookup (vm, thread_index, 0); } /** @@ -984,22 +1032,28 @@ tcp_flush_frames_to_output (u8 thread_index) void tcp_send_fin (tcp_connection_t * tc) { - vlib_buffer_t *b; - u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; + u32 bi; + u8 fin_snt = 0; + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; b = vlib_get_buffer (vm, bi); - /* buffer will be initialized by in tcp_make_fin */ + fin_snt = tc->flags & TCP_CONN_FINSNT; + if (fin_snt) + tc->snd_nxt = tc->snd_una; 
tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); - if (!(tc->flags & TCP_CONN_FINSNT)) + if (!fin_snt) { tc->flags |= TCP_CONN_FINSNT; tc->flags &= ~TCP_CONN_FINPNDG; - tc->snd_nxt += 1; + /* Account for the FIN */ + tc->snd_una_max += 1; + tc->snd_nxt = tc->snd_una_max; } tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); @@ -1398,7 +1452,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) else if (tc->state == TCP_STATE_SYN_RCVD) { tc->rto_boff += 1; - tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); tc->rtt_ts = 0; if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) @@ -1414,7 +1469,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) else { ASSERT (tc->state == TCP_STATE_CLOSED); - clib_warning ("connection closed ..."); + TCP_DBG ("connection state: %d", tc->state); return; } } -- cgit 1.2.3-korg From 7fe51f3e3e80ed6ffe989df1c6963527166afc25 Mon Sep 17 00:00:00 2001 From: Pierre Pfister Date: Wed, 20 Sep 2017 08:48:36 +0200 Subject: tcp: add option to punt traffic Until now, if the stack didn't find a connection for a packet, it sent back a reset. With the punt option enabled, packets are now enqueued to error-punt where they can be handed off to the host os. 
Change-Id: I12dea8694b8bd24c92b0d601412928aa7b8046cb Signed-off-by: Florin Coras Signed-off-by: Pierre Pfister --- src/vnet/ip/punt.c | 66 +++++++++++++++++++++++++++++----------------- src/vnet/tcp/tcp.c | 33 +++++++++++++++++++++++ src/vnet/tcp/tcp.h | 5 ++++ src/vnet/tcp/tcp_error.def | 3 ++- src/vnet/tcp/tcp_input.c | 22 ++++++++++++---- 5 files changed, 99 insertions(+), 30 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/ip/punt.c b/src/vnet/ip/punt.c index 1ea32fa0..0869954c 100644 --- a/src/vnet/ip/punt.c +++ b/src/vnet/ip/punt.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -613,16 +614,15 @@ vnet_punt_socket_del (vlib_main_t * vm, bool is_ip4, u8 l4_protocol, u16 port) * @brief Request IP traffic punt to the local TCP/IP stack. * * @em Note - * - UDP is the only protocol supported in the current implementation - * - When requesting UDP punt port number(s) must be specified - * - All TCP traffic is currently punted to the host by default + * - UDP and TCP are the only protocols supported in the current implementation * * @param vm vlib_main_t corresponding to the current thread * @param ipv IP protcol version. 
* 4 - IPv4, 6 - IPv6, ~0 for both IPv6 and IPv4 * @param protocol 8-bits L4 protocol value - * Only value of 17 (UDP) is currently supported - * @param port 16-bits L4 (TCP/IP) port number when applicable + * UDP is 17 + * TCP is 6 + * @param port 16-bits L4 (TCP/IP) port number when applicable (UDP only) * * @returns 0 on success, non-zero value otherwise */ @@ -630,28 +630,42 @@ clib_error_t * vnet_punt_add_del (vlib_main_t * vm, u8 ipv, u8 protocol, u16 port, bool is_add) { + /* For now we only support UDP punt */ - if (protocol != IP_PROTOCOL_UDP) + if (protocol != IP_PROTOCOL_UDP && protocol != IP_PROTOCOL_TCP) return clib_error_return (0, - "only UDP protocol (%d) is supported, got %d", - IP_PROTOCOL_UDP, protocol); + "only UDP (%d) and TCP (%d) protocols are supported, got %d", + IP_PROTOCOL_UDP, IP_PROTOCOL_TCP, protocol); if (ipv != (u8) ~ 0 && ipv != 4 && ipv != 6) return clib_error_return (0, "IP version must be 4 or 6, got %d", ipv); if (port == (u16) ~ 0) { - if (ipv == 4 || ipv == (u8) ~ 0) - udp_punt_unknown (vm, 1, is_add); + if ((ipv == 4) || (ipv == (u8) ~ 0)) + { + if (protocol == IP_PROTOCOL_UDP) + udp_punt_unknown (vm, 1, is_add); + else if (protocol == IP_PROTOCOL_TCP) + tcp_punt_unknown (vm, 1, is_add); + } - if (ipv == 6 || ipv == (u8) ~ 0) - udp_punt_unknown (vm, 0, is_add); + if ((ipv == 6) || (ipv == (u8) ~ 0)) + { + if (protocol == IP_PROTOCOL_UDP) + udp_punt_unknown (vm, 0, is_add); + else if (protocol == IP_PROTOCOL_TCP) + tcp_punt_unknown (vm, 0, is_add); + } return 0; } else if (is_add) { + if (protocol == IP_PROTOCOL_TCP) + return clib_error_return (0, "punt TCP ports is not supported yet"); + if (ipv == 4 || ipv == (u8) ~ 0) udp_register_dst_port (vm, port, udp4_punt_node.index, 1); @@ -665,32 +679,36 @@ vnet_punt_add_del (vlib_main_t * vm, u8 ipv, u8 protocol, u16 port, } static clib_error_t * -udp_punt_cli (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) +punt_cli (vlib_main_t * vm, + unformat_input_t * 
input, vlib_cli_command_t * cmd) { - u32 udp_port; + u32 port; bool is_add = true; + u32 protocol = ~0; clib_error_t *error; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "del")) is_add = false; - if (unformat (input, "all")) + else if (unformat (input, "all")) { /* punt both IPv6 and IPv4 when used in CLI */ - error = vnet_punt_add_del (vm, ~0, IP_PROTOCOL_UDP, ~0, is_add); + error = vnet_punt_add_del (vm, ~0, protocol, ~0, is_add); if (error) clib_error_report (error); } - else if (unformat (input, "%d", &udp_port)) + else if (unformat (input, "%d", &port)) { /* punt both IPv6 and IPv4 when used in CLI */ - error = - vnet_punt_add_del (vm, ~0, IP_PROTOCOL_UDP, udp_port, is_add); + error = vnet_punt_add_del (vm, ~0, protocol, port, is_add); if (error) clib_error_report (error); } + else if (unformat (input, "udp")) + protocol = IP_PROTOCOL_UDP; + else if (unformat (input, "tcp")) + protocol = IP_PROTOCOL_TCP; } return 0; @@ -717,10 +735,10 @@ udp_punt_cli (vlib_main_t * vm, * @endparblock ?*/ /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (punt_udp_command, static) = { - .path = "set punt udp", - .short_help = "set punt udp [del] ", - .function = udp_punt_cli, +VLIB_CLI_COMMAND (punt_command, static) = { + .path = "set punt", + .short_help = "set punt [udp|tcp] [del] ", + .function = punt_cli, }; /* *INDENT-ON* */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index f779428f..f457ef7e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1398,6 +1398,16 @@ vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en) return 0; } +void +tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) +{ + tcp_main_t *tm = &tcp_main; + if (is_ip4) + tm->punt_unknown4 = is_add; + else + tm->punt_unknown6 = is_add; +} + clib_error_t * tcp_init (vlib_main_t * vm) { @@ -1893,6 +1903,29 @@ VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) = }; /* *INDENT-ON* */ +static clib_error_t * +show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * 
input, + vlib_cli_command_t * cmd_arg) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + vlib_cli_output (vm, "IPv4 TCP punt: %s", + tm->punt_unknown4 ? "enabled" : "disabled"); + vlib_cli_output (vm, "IPv6 TCP punt: %s", + tm->punt_unknown6 ? "enabled" : "disabled"); + return 0; +} +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_tcp_punt_command, static) = +{ + .path = "show tcp punt", + .short_help = "show tcp punt", + .function = show_tcp_punt_fn, +}; +/* *INDENT-ON* */ + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index bb8091af..259dbca1 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -417,6 +417,9 @@ typedef struct _tcp_main /** vlib buffer size */ u32 bytes_per_buffer; + + u8 punt_unknown4; + u8 punt_unknown6; } tcp_main_t; extern tcp_main_t tcp_main; @@ -441,6 +444,8 @@ tcp_buffer_hdr (vlib_buffer_t * b) clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en); +void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); + always_inline tcp_connection_t * tcp_connection_get (u32 conn_index, u32 thread_index) { diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def index 08922315..a179717f 100644 --- a/src/vnet/tcp/tcp_error.def +++ b/src/vnet/tcp/tcp_error.def @@ -39,4 +39,5 @@ tcp_error (RST_SENT, "Resets sent") tcp_error (INVALID_CONNECTION, "Invalid connection") tcp_error (NO_WND, "No window") tcp_error (CONNECTION_CLOSED, "Connection closed") -tcp_error (CREATE_EXISTS, "Connection already exists") \ No newline at end of file +tcp_error (CREATE_EXISTS, "Connection already exists") +tcp_error (PUNT, "Packets punted") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 64a07070..bd57eca3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2869,6 +2869,7 @@ 
typedef enum _tcp_input_next TCP_INPUT_NEXT_SYN_SENT, TCP_INPUT_NEXT_ESTABLISHED, TCP_INPUT_NEXT_RESET, + TCP_INPUT_NEXT_PUNT, TCP_INPUT_N_NEXT } tcp_input_next_t; @@ -2878,7 +2879,8 @@ typedef enum _tcp_input_next _ (RCV_PROCESS, "tcp4-rcv-process") \ _ (SYN_SENT, "tcp4-syn-sent") \ _ (ESTABLISHED, "tcp4-established") \ - _ (RESET, "tcp4-reset") + _ (RESET, "tcp4-reset") \ + _ (PUNT, "error-punt") #define foreach_tcp6_input_next \ _ (DROP, "error-drop") \ @@ -2886,7 +2888,8 @@ typedef enum _tcp_input_next _ (RCV_PROCESS, "tcp6-rcv-process") \ _ (SYN_SENT, "tcp6-syn-sent") \ _ (ESTABLISHED, "tcp6-established") \ - _ (RESET, "tcp6-reset") + _ (RESET, "tcp6-reset") \ + _ (PUNT, "error-punt") #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) @@ -3010,9 +3013,18 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - /* Send reset */ - next0 = TCP_INPUT_NEXT_RESET; - error0 = TCP_ERROR_NO_LISTENER; + if ((is_ip4 && tm->punt_unknown4) || + (!is_ip4 && tm->punt_unknown6)) + { + next0 = TCP_INPUT_NEXT_PUNT; + error0 = TCP_ERROR_PUNT; + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + } } done: -- cgit 1.2.3-korg From f1762d6fa8cbc1a8fae691e568e73e94d5dcbc93 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Sun, 24 Sep 2017 19:43:08 -0400 Subject: tcp: do not sample rtt for retransmitted segments Change-Id: I365c31607332a944ef498369881332b515894ed7 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_debug.h | 108 +++++++++++++++++++++++----------------------- src/vnet/tcp/tcp_input.c | 52 +++++++++++----------- src/vnet/tcp/tcp_output.c | 21 ++++++++- 3 files changed, 100 insertions(+), 81 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 4bc6b42e..eb318cde 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -20,7 +20,7 @@ #define TCP_DEBUG (1) #define TCP_DEBUG_SM (0) -#define 
TCP_DEBUG_CC (1) +#define TCP_DEBUG_CC (0) #define TCP_DEBUG_CC_STAT (1) #define foreach_tcp_dbg_evt \ @@ -411,21 +411,6 @@ typedef enum _tcp_dbg_evt ed->data[4] = _tc->snd_wnd; \ } -#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ - .format_args = "i4i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_nxt - _tc->irs; \ - ed->data[1] = _tc->rcv_wnd; \ - ed->data[2] = _tc->snd_nxt - _tc->iss; \ - ed->data[3] = tcp_available_snd_wnd(_tc); \ - ed->data[4] = _tc->snd_wnd; \ -} - #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -441,21 +426,6 @@ typedef enum _tcp_dbg_evt ed->data[4] = tcp_flight_size(_tc); \ } -#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ - .format_args = "i4i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->snd_una - _tc->iss; \ - ed->data[1] = _tc->cwnd; \ - ed->data[2] = _tc->snd_wnd; \ - ed->data[3] = tcp_flight_size(_tc); \ - ed->data[4] = _tc->rcv_wnd; \ -} - #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -601,9 +571,7 @@ if (_av > 0) \ } #else #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) -#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) -#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) #define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) #define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) @@ -649,19 +617,6 @@ if (_av > 0) \ */ #if TCP_DEBUG_CC -#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) 
\ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ - .format_args = "i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 4); \ - ed->data[0] = _tc->snd_nxt - _tc->iss; \ - ed->data[1] = offset; \ - ed->data[2] = n_bytes; \ - ed->data[3] = _tc->snd_rxt_bytes; \ -} #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ { \ @@ -669,13 +624,14 @@ if (_av > 0) \ { \ .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \ .format_args = "t4i4i4i4", \ - .n_enum_strings = 5, \ + .n_enum_strings = 6, \ .enum_strings = { \ "fast-rxt", \ "rxt-timeout", \ "first-rxt", \ "recovered", \ "congestion", \ + "undo", \ }, \ }; \ DECLARE_ETD(_tc, _e, 4); \ @@ -685,6 +641,50 @@ if (_av > 0) \ ed->data[3] = _tc->snd_rxt_bytes; \ } +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _tc->snd_nxt - _tc->iss; \ + ed->data[1] = offset; \ + ed->data[2] = n_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ +} + +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_nxt - _tc->irs; \ + ed->data[1] = _tc->rcv_wnd; \ + ed->data[2] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = tcp_available_snd_wnd(_tc); \ + ed->data[4] = _tc->snd_wnd; \ +} + +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) 
\ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "dack-rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->cwnd; \ + ed->data[2] = _tc->snd_wnd; \ + ed->data[3] = tcp_flight_size(_tc); \ + ed->data[4] = _tc->rcv_wnd; \ +} + #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -696,6 +696,13 @@ if (_av > 0) \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->snd_una_max - _tc->iss; \ } +#else +#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) +#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) +#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#endif /* * Congestion control stats @@ -744,13 +751,6 @@ if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \ #define TCP_EVT_CC_STAT_HANDLER(_tc, ...) #endif -#else -#define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) -#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) -#define TCP_EVT_CC_PACK_HANDLER(_tc, ...) -#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) -#endif - #endif /* SRC_VNET_TCP_TCP_DEBUG_H_ */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index bd57eca3..0a36d063 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -248,8 +248,8 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) * then the TSval from the segment is copied to TS.Recent; * otherwise, the TSval is ignored. 
*/ - if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent - && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) + if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las) + && seq_leq (tc->rcv_las, seq_end)) { ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; @@ -418,51 +418,53 @@ tcp_update_rto (tcp_connection_t * tc) tc->rto = clib_max (tc->rto, TCP_RTO_MIN); } -/** Update RTT estimate and RTO timer +/** + * Update RTT estimate and RTO timer * * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK * timing. Middle boxes are known to fiddle with TCP options so we * should give higher priority to ACK timing. * + * This should be called only if previously sent bytes have been acked. + * * return 1 if valid rtt 0 otherwise */ static int tcp_update_rtt (tcp_connection_t * tc, u32 ack) { u32 mrtt = 0; - u8 rtx_acked; - - /* Determine if only rtx bytes are acked. */ - rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked; /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ - if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !rtx_acked) + if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes) + goto done; + + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { mrtt = tcp_time_now () - tc->rtt_ts; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: - * seq_lt (tc->snd_una, ack). */ - else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr - && tc->bytes_acked) + * seq_lt (tc->snd_una, ack). 
This is a condition for calling update_rtt */ + else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr) { mrtt = tcp_time_now () - tc->rcv_opts.tsecr; } - /* Allow measuring of a new RTT */ - tc->rtt_ts = 0; - - /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is - * not valid */ - if (tc->bytes_acked) - tc->rto_boff = 0; - /* Ignore dubious measurements */ if (mrtt == 0 || mrtt > TCP_RTT_MAX) - return 0; + goto done; tcp_estimate_rtt (tc, mrtt); + +done: + + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; + + /* If we got here something must've been ACKed so make sure boff is 0, + * even if mrrt is not valid since we update the rto lower */ + tc->rto_boff = 0; tcp_update_rto (tc); return 0; @@ -932,10 +934,11 @@ static void tcp_cc_recovery_exit (tcp_connection_t * tc) { /* Deflate rto */ - tcp_update_rto (tc); tc->rto_boff = 0; + tcp_update_rto (tc); tc->snd_rxt_ts = 0; tcp_recovery_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } void @@ -946,6 +949,7 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->rcv_dupacks = 0; tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } static void @@ -958,13 +962,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) if (tcp_in_recovery (tc)) tcp_cc_recovery_exit (tc); ASSERT (tc->rto_boff == 0); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5); /* TODO extend for fastrecovery */ } static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tcp_in_recovery (tc) + return (tcp_in_recovery (tc) && tc->rto_boff == 1 && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); @@ -988,7 +993,6 @@ tcp_cc_recover (tcp_connection_t * tc) ASSERT (tc->rto_boff == 0); ASSERT (!tcp_in_cong_recovery (tc)); ASSERT (tcp_scoreboard_is_sane_post_recovery (tc)); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); return 0; } @@ -2115,7 +2119,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, 
new_tc0->flags |= TCP_CONN_SNDACK; /* Update rtt with the syn-ack sample */ - new_tc0->bytes_acked = 1; tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); } @@ -2352,7 +2355,6 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Update rtt and rto */ - tc0->bytes_acked = 1; tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); /* Switch state to ESTABLISHED */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index be29f05f..cb1fcc9a 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -918,7 +918,24 @@ tcp_send_reset (tcp_connection_t * tc) opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); ASSERT (opts_write_len == tc->snd_opts_len); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; - tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); + if (tc->c_is_ip4) + { + ip4_header_t *ih4; + ih4 = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip.ip4, + &tc->c_rmt_ip.ip4, IP_PROTOCOL_TCP, 0); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); + } + else + { + int bogus = ~0; + ip6_header_t *ih6; + ih6 = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip.ip6, + &tc->c_rmt_ip.ip6, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); + ASSERT (!bogus); + } + tcp_enqueue_to_ip_lookup_now (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } void @@ -1324,7 +1341,7 @@ tcp_rtx_timeout_cc (tcp_connection_t * tc) tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; - + tc->rtt_ts = 0; tcp_recovery_on (tc); } -- cgit 1.2.3-korg From 84275e96559ced02d456cb8b034237d5782b9693 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 26 Sep 2017 12:30:40 -0400 Subject: tcp: update snd_nxt after congestion recovery Change-Id: I2cf4c4850b9c3c093a7dce0cec89b9f710f69393 Signed-off-by: Florin Coras --- 
src/vnet/tcp/tcp_input.c | 2 ++ src/vnet/tcp/tcp_output.c | 14 +++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0a36d063..62dcdc5e 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -937,6 +937,7 @@ tcp_cc_recovery_exit (tcp_connection_t * tc) tc->rto_boff = 0; tcp_update_rto (tc); tc->snd_rxt_ts = 0; + tc->snd_nxt = tc->snd_una_max; tcp_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -947,6 +948,7 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->cc_algo->recovered (tc); tc->snd_rxt_bytes = 0; tc->rcv_dupacks = 0; + tc->snd_nxt = tc->snd_una_max; tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index cb1fcc9a..a954bfa7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -1391,13 +1391,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Increment RTO backoff (also equal to number of retries) and go back * to first un-acked byte */ tc->rto_boff += 1; - tc->snd_nxt = tc->snd_una; /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); - /* Exponential backoff */ + tc->snd_nxt = tc->snd_una; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); @@ -1515,7 +1514,7 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt, max_snd_bytes, available_bytes, offset; + u32 bi, max_snd_bytes, available_bytes, offset; int n_bytes = 0; u8 *data; @@ -1567,14 +1566,11 @@ tcp_timer_persist_handler (u32 index) n_bytes = stream_session_peek_bytes (&tc->connection, data, offset, max_snd_bytes); b->current_length = n_bytes; - ASSERT (n_bytes != 0 && (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 - || tcp_timer_is_active (tc, - 
TCP_TIMER_RETRANSMIT))); + ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT) + || tc->snd_nxt == tc->snd_una_max + || tc->rto_boff > 1)); - /* Allow updating of snd_una_max but don't update snd_nxt */ - old_snd_nxt = tc->snd_nxt; tcp_push_hdr_i (tc, b, tc->state, 0); - tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Just sent new data, enable retransmit */ -- cgit 1.2.3-korg From 879ace3d3b238ec8db80f52ebfd556aa6f12b4c7 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Tue, 26 Sep 2017 13:15:16 -0400 Subject: Various fixes for issues found by Coverity (VPP-972) 174267: Revisit this string termination issue 174816: Add check for NULL when trace is enabled 177211: Add notation that mutex is not required here 177117: Added check for log2_page_size == 0 and returns an error if so 163697,163698: Added missing sw_if_index validation Change-Id: I5a76fcf6505c785bfb3269e353360031c6a0fd0f Signed-off-by: Chris Luke --- src/uri/sock_test_server.c | 8 ++++++-- src/vnet/srv6/sr_api.c | 10 ++++++++++ src/vnet/tcp/tcp_input.c | 3 ++- src/vpp-api/vapi/vapi.c | 1 + src/vppinfra/linux/mem.c | 6 ++++++ 5 files changed, 25 insertions(+), 3 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/uri/sock_test_server.c b/src/uri/sock_test_server.c index 29adea25..35046aa0 100644 --- a/src/uri/sock_test_server.c +++ b/src/uri/sock_test_server.c @@ -514,7 +514,7 @@ main (int argc, char **argv) continue; } - else if (((char *) conn->buf)[0] != 0) + else if (isascii (conn->buf[0])) { // If it looks vaguely like a string, make sure it's terminated ((char *) conn->buf)[rx_bytes < @@ -536,8 +536,12 @@ main (int argc, char **argv) continue; } - if (isascii (conn->buf[0]) && strlen ((const char *) conn->buf)) + if (isascii (conn->buf[0])) { + // If it looks vaguely like a string, make sure it's terminated + ((char *) conn->buf)[rx_bytes < + conn->buf_size ? 
rx_bytes : + conn->buf_size - 1] = 0; if (xtra) fprintf (stderr, "ERROR: FIFO not drained in previous test!\n" diff --git a/src/vnet/srv6/sr_api.c b/src/vnet/srv6/sr_api.c index 925b50a1..623f672a 100644 --- a/src/vnet/srv6/sr_api.c +++ b/src/vnet/srv6/sr_api.c @@ -60,6 +60,9 @@ static void vl_api_sr_localsid_add_del_t_handler * char end_psp, u8 behavior, u32 sw_if_index, u32 vlan_index, u32 fib_table, * ip46_address_t *nh_addr, void *ls_plugin_mem) */ + + VALIDATE_SW_IF_INDEX (mp); + rv = sr_cli_localsid (mp->is_del, (ip6_address_t *) & mp->localsid_addr, mp->end_psp, @@ -69,6 +72,8 @@ static void vl_api_sr_localsid_add_del_t_handler ntohl (mp->fib_table), (ip46_address_t *) & mp->nh_addr, NULL); + BAD_SW_IF_INDEX_LABEL; + REPLY_MACRO (VL_API_SR_LOCALSID_ADD_DEL_REPLY); } @@ -158,6 +163,9 @@ static void vl_api_sr_steering_add_del_t_handler * u32 table_id, ip46_address_t *prefix, u32 mask_width, u32 sw_if_index, * u8 traffic_type) */ + + VALIDATE_SW_IF_INDEX (mp); + rv = sr_steering_policy (mp->is_del, (ip6_address_t *) & mp->bsid_addr, ntohl (mp->sr_policy_index), @@ -166,6 +174,8 @@ static void vl_api_sr_steering_add_del_t_handler ntohl (mp->mask_width), ntohl (mp->sw_if_index), mp->traffic_type); + BAD_SW_IF_INDEX_LABEL; + REPLY_MACRO (VL_API_SR_STEERING_ADD_DEL_REPLY); } diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 62dcdc5e..63d6fd87 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -2163,7 +2163,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: b0->error = error0 ? 
node->errors[error0] : 0; - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + if (PREDICT_FALSE + ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); diff --git a/src/vpp-api/vapi/vapi.c b/src/vpp-api/vapi/vapi.c index 59415e03..3150d2b4 100644 --- a/src/vpp-api/vapi/vapi.c +++ b/src/vpp-api/vapi/vapi.c @@ -305,6 +305,7 @@ vapi_connect (vapi_ctx_t ctx, const char *name, } ctx->requests = tmp; memset (ctx->requests, 0, size); + /* coverity[MISSING_LOCK] - 177211 requests_mutex is not needed here */ ctx->requests_start = ctx->requests_count = 0; if (chroot_prefix) { diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c index df46763a..2d8f593d 100644 --- a/src/vppinfra/linux/mem.c +++ b/src/vppinfra/linux/mem.c @@ -132,6 +132,12 @@ clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a) } } log2_page_size = clib_mem_vm_get_log2_page_size (fd); + + if (log2_page_size == 0) + { + err = clib_error_return_unix (0, "cannot determine page size"); + goto error; + } } else /* not CLIB_MEM_VM_F_SHARED */ { -- cgit 1.2.3-korg