summaryrefslogtreecommitdiffstats
path: root/src/vnet/tcp
diff options
context:
space:
mode:
authorFlorin Coras <fcoras@cisco.com>2019-02-16 20:47:32 -0800
committerDamjan Marion <dmarion@me.com>2019-02-18 20:43:33 +0000
commit222e1f4160a5828bb2b5bf62716cd76664f6100b (patch)
treeeeb8b47fa94dc23152f1652e9dbd14c384d910ff /src/vnet/tcp
parenta333795d1c425877645754a384af47755a80712e (diff)
tcp: harden for high scale scenarios
- Better handle buffer starvation scenarios - Handle case when both peers enter recovery due to packet loss. - Fix passive open establish cleanup Change-Id: I2f28baa2ff0383bb8f5f6d2452b49aa38ce69bce Signed-off-by: Florin Coras <fcoras@cisco.com>
Diffstat (limited to 'src/vnet/tcp')
-rw-r--r--src/vnet/tcp/tcp.c8
-rw-r--r--src/vnet/tcp/tcp.h2
-rw-r--r--src/vnet/tcp/tcp_error.def1
-rw-r--r--src/vnet/tcp/tcp_input.c44
-rw-r--r--src/vnet/tcp/tcp_output.c43
5 files changed, 69 insertions, 29 deletions
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 81f209b5d7c..c51224447fc 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Copyright (c) 2016-2019 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
@@ -1244,11 +1244,13 @@ tcp_timer_establish_handler (u32 conn_index)
if (PREDICT_FALSE (tc == 0))
return;
ASSERT (tc->state == TCP_STATE_SYN_RCVD);
+ tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
/* Start cleanup. App wasn't notified yet so use delete notify as
* opposed to delete to cleanup session layer state. */
+ tcp_connection_timers_reset (tc);
session_transport_delete_notify (&tc->connection);
- tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
- tcp_connection_cleanup (tc);
+ tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
}
static void
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 68750ce373f..d1fbf156ed5 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Copyright (c) 2016-2019 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def
index 7bed10f1840..7dbe952e104 100644
--- a/src/vnet/tcp/tcp_error.def
+++ b/src/vnet/tcp/tcp_error.def
@@ -28,6 +28,7 @@ tcp_error (SEGMENT_INVALID, "Invalid segments")
tcp_error (SYNS_RCVD, "SYNs received")
tcp_error (SPURIOUS_SYN, "Spurious SYNs received")
tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received")
+tcp_error (SPURIOUS_SYN_ACK, "Spurious SYN-ACKs received")
tcp_error (MSG_QUEUE_FULL, "Events not sent for lack of msg queue space")
tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated")
tcp_error (ACK_OK, "Pure ACKs received")
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 6514dca4b4a..3d8ae890372 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -350,20 +350,34 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0,
goto error;
}
- *error0 = TCP_ERROR_RCV_WND;
/* If our window is 0 and the packet is in sequence, let it pass
* through for ack processing. It should be dropped later. */
- if (!(tc0->rcv_wnd == 0
- && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number))
+ if (tc0->rcv_wnd == 0
+ && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
+ goto check_reset;
+
+ /* If we entered recovery and peer did so as well, there's a chance that
+ * dup acks won't be acceptable on either end because seq_end may be less
+ * than rcv_las. This can happen if acks are lost in both directions. */
+ if (tcp_in_recovery (tc0)
+ && seq_geq (vnet_buffer (b0)->tcp.seq_number,
+ tc0->rcv_las - tc0->rcv_wnd)
+ && seq_leq (vnet_buffer (b0)->tcp.seq_end,
+ tc0->rcv_nxt + tc0->rcv_wnd))
+ goto check_reset;
+
+ *error0 = TCP_ERROR_RCV_WND;
+
+ /* If not RST, send dup ack */
+ if (!tcp_rst (th0))
{
- /* If not RST, send dup ack */
- if (!tcp_rst (th0))
- {
- tcp_program_dupack (wrk, tc0);
- TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
- }
- goto error;
+ tcp_program_dupack (wrk, tc0);
+ TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
}
+ goto error;
+
+ check_reset:
+ ;
}
/* 2nd: check the RST bit */
@@ -507,6 +521,7 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc)
if (tc->rtt_ts)
{
tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
+ tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001);
mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
tc->rtt_ts = 0;
}
@@ -514,6 +529,9 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc)
{
mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
mrtt = clib_max (mrtt, 1);
+ /* Due to retransmits we don't know the initial mrtt */
+ if (tc->rto_boff && mrtt > 1 * THZ)
+ mrtt = 1 * THZ;
tc->mrtt_us = (f64) mrtt *TCP_TICK;
}
@@ -2360,6 +2378,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Make sure the connection actually exists */
ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0,
my_thread_index, is_ip4));
+ error0 = TCP_ERROR_SPURIOUS_SYN_ACK;
goto drop;
}
@@ -2441,7 +2460,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* No SYN flag. Drop. */
if (!tcp_syn (tcp0))
{
- clib_warning ("not synack");
error0 = TCP_ERROR_SEGMENT_INVALID;
goto drop;
}
@@ -2449,7 +2467,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* Parse options */
if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1))
{
- clib_warning ("options parse fail");
error0 = TCP_ERROR_OPTIONS;
goto drop;
}
@@ -3676,6 +3693,9 @@ do { \
_(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
_(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
TCP_ERROR_NONE);
+ _(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
+ _(SYN_SENT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
+ TCP_ERROR_NONE);
/* ACK for for established connection -> tcp-established. */
_(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
/* FIN for for established connection -> tcp-established. */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 9f38851b026..725ffec0852 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Copyright (c) 2016-2019 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
@@ -587,11 +587,6 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
tcp_options_write ((u8 *) (th + 1), snd_opts);
vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
-
- /* Init retransmit timer. Use update instead of set because of
- * retransmissions */
- tcp_retransmit_timer_force_update (tc);
- TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
}
always_inline void
@@ -951,7 +946,10 @@ tcp_send_syn (tcp_connection_t * tc)
tc->rto * TCP_TO_TIMER_TICK);
if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
- return;
+ {
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1);
+ return;
+ }
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
@@ -975,14 +973,20 @@ tcp_send_synack (tcp_connection_t * tc)
vlib_buffer_t *b;
u32 bi;
+ tcp_retransmit_timer_force_update (tc);
+
if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
- return;
+ {
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
+ return;
+ }
tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
tcp_make_synack (tc, b);
tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
+ TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
}
/**
@@ -1050,6 +1054,9 @@ tcp_send_fin (tcp_connection_t * tc)
tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
if (fin_snt)
tc->snd_nxt = tc->snd_una_max;
+ else
+ /* Make sure retransmit retries a fin not data */
+ tc->flags |= TCP_CONN_FINSNT;
return;
}
@@ -1195,7 +1202,10 @@ tcp_send_ack (tcp_connection_t * tc)
u32 bi;
if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
- return;
+ {
+ tcp_update_rcv_wnd (tc);
+ return;
+ }
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
tcp_make_ack (tc, b);
@@ -1458,6 +1468,8 @@ tcp_rxt_timeout_cc (tcp_connection_t * tc)
scoreboard_clear (&tc->sack_sb);
tcp_cc_fastrecovery_exit (tc);
}
+ else
+ tc->rcv_dupacks = 0;
/* Start again from the beginning */
tc->cc_algo->congestion (tc);
@@ -1504,7 +1516,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
/* Lost FIN, retransmit and return */
- if (tcp_is_lost_fin (tc))
+ if (tc->flags & TCP_CONN_FINSNT)
{
tcp_send_fin (tc);
tc->rto_boff += 1;
@@ -1553,7 +1565,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b);
if (!n_bytes)
{
- tcp_retransmit_timer_force_update (tc);
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
return;
}
@@ -1591,7 +1603,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tc->rto * TCP_TO_TIMER_TICK);
if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
- return;
+ {
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1);
+ return;
+ }
b = vlib_get_buffer (vm, bi);
tcp_init_buffer (vm, b);
@@ -1614,9 +1629,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
tc->rtt_ts = 0;
+ tcp_retransmit_timer_force_update (tc);
+
if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
{
- tcp_retransmit_timer_force_update (tc);
+ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
return;
}
ight .vc { color: #f8f8f2 } /* Name.Variable.Class */ .highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */ .highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */ .highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */ .highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */ } @media (prefers-color-scheme: light) { .highlight .hll { background-color: #ffffcc } .highlight .c { color: #888888 } /* Comment */ .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ .highlight .k { color: #008800; font-weight: bold } /* Keyword */ .highlight .ch { color: #888888 } /* Comment.Hashbang */ .highlight .cm { color: #888888 } /* Comment.Multiline */ .highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ .highlight .cpf { color: #888888 } /* Comment.PreprocFile */ .highlight .c1 { color: #888888 } /* Comment.Single */ .highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ .highlight .ge { font-style: italic } /* Generic.Emph */ .highlight .gr { color: #aa0000 } /* Generic.Error */ .highlight .gh { color: #333333 } /* Generic.Heading */ .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ .highlight .go { color: #888888 } /* Generic.Output */ .highlight .gp { color: #555555 } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #666666 } /* Generic.Subheading */ .highlight .gt { color: #aa0000 } /* Generic.Traceback */ .highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #008800 } /* Keyword.Pseudo */ .highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */ .highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */ .highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ .highlight .na { color: #336699 } /* Name.Attribute */ .highlight .nb { color: #003388 } /* Name.Builtin */ .highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */ .highlight .no { color: #003366; font-weight: bold } /* Name.Constant */ .highlight .nd { color: #555555 } /* Name.Decorator */ .highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ .highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */ .highlight .nl { color: #336699; font-style: italic } /* Name.Label */ .highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ .highlight .py { color: #336699; font-weight: bold } /* Name.Property */ .highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #336699 } /* Name.Variable */ .highlight .ow { color: #008800 } /* Operator.Word */ .highlight .w { color: #bbbbbb } /* Text.Whitespace */ .highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */ .highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ .highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ .highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ .highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ .highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */ .highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ .highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ .highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */ .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */ .highlight .vc { color: #336699 } /* Name.Variable.Class */ .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ .highlight .vm { color: #336699 } /* Name.Variable.Magic */ .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ }
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/vnet.h>
#include <vppinfra/xxhash.h>
#include <vlib/threads.h>
#include <vnet/handoff.h>
#include <vnet/feature/feature.h>

typedef struct
{
  uword *workers_bitmap;
  u32 *workers;
} per_inteface_handoff_data_t;

typedef struct
{
  u32 cached_next_index;
  u32 num_workers;
  u32 first_worker_index;

  per_inteface_handoff_data_t *if_data;

  /* Worker handoff index */
  u32 frame_queue_index;

  /* convenience variables */
  vlib_main_t *vlib_main;
  vnet_main_t *vnet_main;

    u64 (*hash_fn) (ethernet_header_t *);
} handoff_main_t;

handoff_main_t handoff_main;
vlib_node_registration_t handoff_dispatch_node;

typedef struct
{
  u32 sw_if_index;
  u32 next_worker_index;
  u32 buffer_index;
} worker_handoff_trace_t;

/* packet trace format function */
static u8 *
format_worker_handoff_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  worker_handoff_trace_t *t = va_arg (*args, worker_handoff_trace_t *);

  s =
    format (s, "worker-handoff: sw_if_index %d, next_worker %d, buffer 0x%x",
	    t->sw_if_index, t->next_worker_index, t->buffer_index);
  return s;
}

vlib_node_registration_t handoff_node;

static uword
worker_handoff_node_fn (vlib_main_t * vm,
			vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  handoff_main_t *hm = &handoff_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  u32 n_left_from, *from;
  static __thread vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index;
  static __thread vlib_frame_queue_t **congested_handoff_queue_by_worker_index
    = 0;
  vlib_frame_queue_elt_t *hf = 0;
  int i;
  u32 n_left_to_next_worker = 0, *to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;

  if (PREDICT_FALSE (handoff_queue_elt_by_worker_index == 0))
    {
      vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);

      vec_validate_init_empty (congested_handoff_queue_by_worker_index,
			       hm->first_worker_index + hm->num_workers - 1,
			       (vlib_frame_queue_t *) (~0));
    }

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  while (n_left_from > 0)
    {
      u32 bi0;
      vlib_buffer_t *b0;
      u32 sw_if_index0;
      u32 hash;
      u64 hash_key;
      per_inteface_handoff_data_t *ihd0;
      u32 index0;

      bi0 = from[0];
      from += 1;
      n_left_from -= 1;

      b0 = vlib_get_buffer (vm, bi0);
      sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
      ASSERT (hm->if_data);
      ihd0 = vec_elt_at_index (hm->if_data, sw_if_index0);

      next_worker_index = hm->first_worker_index;

      /*
       * Force unknown traffic onto worker 0,
       * and into ethernet-input. $$$$ add more hashes.
       */

      /* Compute ingress LB hash */
      hash_key = hm->hash_fn ((ethernet_header_t *) b0->data);
      hash = (u32) clib_xxhash (hash_key);

      /* if input node did not specify next index, then packet
         should go to eternet-input */
      if (PREDICT_FALSE ((b0->flags & VNET_BUFFER_F_HANDOFF_NEXT_VALID) == 0))
	vnet_buffer (b0)->handoff.next_index =
	  HANDOFF_DISPATCH_NEXT_ETHERNET_INPUT;
      else if (vnet_buffer (b0)->handoff.next_index ==
	       HANDOFF_DISPATCH_NEXT_IP4_INPUT
	       || vnet_buffer (b0)->handoff.next_index ==
	       HANDOFF_DISPATCH_NEXT_IP6_INPUT
	       || vnet_buffer (b0)->handoff.next_index ==
	       HANDOFF_DISPATCH_NEXT_MPLS_INPUT)
	vlib_buffer_advance (b0, (sizeof (ethernet_header_t)));

      if (PREDICT_TRUE (is_pow2 (vec_len (ihd0->workers))))
	index0 = hash & (vec_len (ihd0->workers) - 1);
      else
	index0 = hash % vec_len (ihd0->workers);

      next_worker_index += ihd0->workers[index0];

      if (next_worker_index != current_worker_index)
	{
	  if (hf)
	    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

	  hf = vlib_get_worker_handoff_queue_elt (hm->frame_queue_index,
						  next_worker_index,
						  handoff_queue_elt_by_worker_index);

	  n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
	  to_next_worker = &hf->buffer_index[hf->n_vectors];
	  current_worker_index = next_worker_index;
	}

      /* enqueue to correct worker thread */
      to_next_worker[0] = bi0;
      to_next_worker++;
      n_left_to_next_worker--;

      if (n_left_to_next_worker == 0)
	{
	  hf->n_vectors = VLIB_FRAME_SIZE;
	  vlib_put_frame_queue_elt (hf);
	  current_worker_index = ~0;
	  handoff_queue_elt_by_worker_index[next_worker_index] = 0;
	  hf = 0;
	}

      if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
			 && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  worker_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b0, sizeof (*t));
	  t->sw_if_index = sw_if_index0;
	  t->next_worker_index = next_worker_index - hm->first_worker_index;
	  t->buffer_index = bi0;
	}

    }

  if (hf)
    hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;

  /* Ship frames to the worker nodes */
  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
    {
      if (handoff_queue_elt_by_worker_index[i])
	{
	  hf = handoff_queue_elt_by_worker_index[i];
	  /*
	   * It works better to let the handoff node
	   * rate-adapt, always ship the handoff queue element.
	   */
	  if (1 || hf->n_vectors == hf->last_n_vectors)
	    {
	      vlib_put_frame_queue_elt (hf);
	      handoff_queue_elt_by_worker_index[i] = 0;
	    }
	  else
	    hf->last_n_vectors = hf->n_vectors;
	}
      congested_handoff_queue_by_worker_index[i] =
	(vlib_frame_queue_t *) (~0);
    }
  hf = 0;
  current_worker_index = ~0;
  return frame->n_vectors;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (worker_handoff_node) = {
  .function = worker_handoff_node_fn,
  .name = "worker-handoff",
  .vector_size = sizeof (u32),
  .format_trace = format_worker_handoff_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,

  .n_next_nodes = 1,
  .next_nodes = {
    [0] = "error-drop",
  },
};

VLIB_NODE_FUNCTION_MULTIARCH (worker_handoff_node, worker_handoff_node_fn)
/* *INDENT-ON* */

int
interface_handoff_enable_disable (vlib_main_t * vm, u32 sw_if_index,
				  uword * bitmap, int enable_disable)
{
  handoff_main_t *hm = &handoff_main;
  vnet_sw_interface_t *sw;
  vnet_main_t *vnm = vnet_get_main ();
  per_inteface_handoff_data_t *d;
  int i, rv = 0;

  if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index))
    return VNET_API_ERROR_INVALID_SW_IF_INDEX;

  sw = vnet_get_sw_interface (vnm, sw_if_index);
  if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
    return VNET_API_ERROR_INVALID_SW_IF_INDEX;

  if (clib_bitmap_last_set (bitmap) >= hm->num_workers)
    return VNET_API_ERROR_INVALID_WORKER;

  if (hm->frame_queue_index == ~0)
    hm->frame_queue_index =
      vlib_frame_queue_main_init (handoff_dispatch_node.index, 0);

  vec_validate (hm->if_data, sw_if_index);
  d = vec_elt_at_index (hm->if_data, sw_if_index);

  vec_free (d->workers);
  vec_free (d->workers_bitmap);

  if (enable_disable)
    {
      d->workers_bitmap = bitmap;
      /* *INDENT-OFF* */
      clib_bitmap_foreach (i, bitmap,
	({
	  vec_add1(d->workers, i);
	}));
      /* *INDENT-ON* */
    }

  vnet_feature_enable_disable ("device-input", "worker-handoff",
			       sw_if_index, enable_disable, 0, 0);
  return rv;
}

static clib_error_t *
set_interface_handoff_command_fn (vlib_main_t * vm,
				  unformat_input_t * input,
				  vlib_cli_command_t * cmd)
{
  handoff_main_t *hm = &handoff_main;
  u32 sw_if_index = ~0;
  int enable_disable = 1;
  uword *bitmap = 0;
  u32 sym = ~0;

  int rv = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "disable"))
	enable_disable = 0;
      else if (unformat (input, "workers %U", unformat_bitmap_list, &bitmap))
	;
      else if (unformat (input, "%U", unformat_vnet_sw_interface,
			 vnet_get_main (), &sw_if_index))
	;
      else if (unformat (input, "symmetrical"))
	sym = 1;
      else if (unformat (input, "asymmetrical"))
	sym = 0;
      else
	break;
    }

  if (sw_if_index == ~0)
    return clib_error_return (0, "Please specify an interface...");

  if (bitmap == 0)
    return clib_error_return (0, "Please specify list of workers...");

  rv =
    interface_handoff_enable_disable (vm, sw_if_index, bitmap,
				      enable_disable);

  switch (rv)
    {
    case 0:
      break;

    case VNET_API_ERROR_INVALID_SW_IF_INDEX:
      return clib_error_return (0, "Invalid interface");
      break;

    case VNET_API_ERROR_INVALID_WORKER:
      return clib_error_return (0, "Invalid worker(s)");
      break;

    case VNET_API_ERROR_UNIMPLEMENTED:
      return clib_error_return (0,
				"Device driver doesn't support redirection");
      break;

    default:
      return clib_error_return (0, "unknown return value %d", rv);
    }

  if (sym == 1)
    hm->hash_fn = eth_get_sym_key;
  else if (sym == 0)
    hm->hash_fn = eth_get_key;

  return 0;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (set_interface_handoff_command, static) = {
  .path = "set interface handoff",
  .short_help =
  "set interface handoff <interface-name> workers <workers-list> [symmetrical|asymmetrical]",
  .function = set_interface_handoff_command_fn,
};
/* *INDENT-ON* */

typedef struct
{
  u32 buffer_index;
  u32 next_index;
  u32 sw_if_index;
} handoff_dispatch_trace_t;

/* packet trace format function */
static u8 *
format_handoff_dispatch_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  handoff_dispatch_trace_t *t = va_arg (*args, handoff_dispatch_trace_t *);

  s = format (s, "handoff-dispatch: sw_if_index %d next_index %d buffer 0x%x",
	      t->sw_if_index, t->next_index, t->buffer_index);
  return s;
}

#define foreach_handoff_dispatch_error \
_(EXAMPLE, "example packets")

typedef enum
{
#define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
  foreach_handoff_dispatch_error
#undef _
    HANDOFF_DISPATCH_N_ERROR,
} handoff_dispatch_error_t;

static char *handoff_dispatch_error_strings[] = {
#define _(sym,string) string,
  foreach_handoff_dispatch_error
#undef _
};

static uword
handoff_dispatch_node_fn (vlib_main_t * vm,
			  vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  handoff_dispatch_next_t next_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  u32 bi0, bi1;
	  vlib_buffer_t *b0, *b1;
	  u32 next0, next1;
	  u32 sw_if_index0, sw_if_index1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, LOAD);
	    vlib_prefetch_buffer_header (p3, LOAD);
	  }

	  /* speculatively enqueue b0 and b1 to the current next frame */
	  to_next[0] = bi0 = from[0];
	  to_next[1] = bi1 = from[1];
	  from += 2;
	  to_next += 2;
	  n_left_from -= 2;
	  n_left_to_next -= 2;

	  b0 = vlib_get_buffer (vm, bi0);
	  b1 = vlib_get_buffer (vm, bi1);

	  next0 = vnet_buffer (b0)->handoff.next_index;
	  next1 = vnet_buffer (b1)->handoff.next_index;

	  if (PREDICT_FALSE (vm->trace_main.trace_active_hint))
	    {
	      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		{
		  vlib_trace_buffer (vm, node, next0, b0,	/* follow_chain */
				     0);
		  handoff_dispatch_trace_t *t =
		    vlib_add_trace (vm, node, b0, sizeof (*t));
		  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
		  t->sw_if_index = sw_if_index0;
		  t->next_index = next0;
		  t->buffer_index = bi0;
		}
	      if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
		{
		  vlib_trace_buffer (vm, node, next1, b1,	/* follow_chain */
				     0);
		  handoff_dispatch_trace_t *t =
		    vlib_add_trace (vm, node, b1, sizeof (*t));
		  sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
		  t->sw_if_index = sw_if_index1;
		  t->next_index = next1;
		  t->buffer_index = bi1;
		}
	    }

	  /* verify speculative enqueues, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  u32 sw_if_index0;

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);

	  next0 = vnet_buffer (b0)->handoff.next_index;

	  if (PREDICT_FALSE (vm->trace_main.trace_active_hint))
	    {
	      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		{
		  vlib_trace_buffer (vm, node, next0, b0,	/* follow_chain */
				     0);
		  handoff_dispatch_trace_t *t =
		    vlib_add_trace (vm, node, b0, sizeof (*t));
		  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
		  t->sw_if_index = sw_if_index0;
		  t->next_index = next0;
		  t->buffer_index = bi0;
		}
	    }

	  /* verify speculative enqueue, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (handoff_dispatch_node) = {
  .function = handoff_dispatch_node_fn,
  .name = "handoff-dispatch",
  .vector_size = sizeof (u32),
  .format_trace = format_handoff_dispatch_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .flags = VLIB_NODE_FLAG_IS_HANDOFF,

  .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
  .error_strings = handoff_dispatch_error_strings,

  .n_next_nodes = HANDOFF_DISPATCH_N_NEXT,

  .next_nodes = {
        [HANDOFF_DISPATCH_NEXT_DROP] = "error-drop",
        [HANDOFF_DISPATCH_NEXT_ETHERNET_INPUT] = "ethernet-input",
        [HANDOFF_DISPATCH_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
        [HANDOFF_DISPATCH_NEXT_IP6_INPUT] = "ip6-input",
        [HANDOFF_DISPATCH_NEXT_MPLS_INPUT] = "mpls-input",
  },
};

VLIB_NODE_FUNCTION_MULTIARCH (handoff_dispatch_node, handoff_dispatch_node_fn)
/* *INDENT-ON* */

clib_error_t *
handoff_init (vlib_main_t * vm)
{
  handoff_main_t *hm = &handoff_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  clib_error_t *error;
  uword *p;

  if ((error = vlib_call_init_function (vm, threads_init)))
    return error;

  vlib_thread_registration_t *tr;
  /* Only the standard vnet worker threads are supported */
  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
  if (p)
    {
      tr = (vlib_thread_registration_t *) p[0];
      if (tr)
	{
	  hm->num_workers = tr->count;
	  hm->first_worker_index = tr->first_index;
	}
    }

  hm->hash_fn = eth_get_key;

  hm->vlib_main = vm;
  hm->vnet_main = &vnet_main;

  hm->frame_queue_index = ~0;

  return 0;
}

VLIB_INIT_FUNCTION (handoff_init);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */