/*
 * vnet/pipeline.h: software pipeline
 *
 * Copyright (c) 2012 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Usage example.
 *
 * #define NSTAGES 3 or whatever
 *
 * If using an aux data vector - to hold bihash keys or some such:
 *
 * #define AUX_DATA_TYPE my_aux_data_t
 *
 * <Define pipeline stages: stage0 .. stage(NSTAGES-2), plus last_stage>
 *
 * #include <vnet/pipeline.h>
 *
 * static uword my_node_fn (vlib_main_t * vm,
 *                               vlib_node_runtime_t * node,
 *                               vlib_frame_t * frame)
 * {
 *     return dispatch_pipeline (vm, node, frame);
 * }
 *
 */

/*
 * NSTAGES selects which dispatch_pipeline() variant below is compiled,
 * so the including file has to define it before the #include.
 */
#ifndef NSTAGES
#error files which #include <vnet/pipeline.h> must define NSTAGES
#endif

/*
 * STAGE_INLINE is the qualifier placed on generic_stage0() and
 * dispatch_pipeline(). The including file may define it first to
 * override the plain "inline" default.
 */
#ifndef STAGE_INLINE
#define STAGE_INLINE inline
#endif

/*
 * Optional per-buffer aux data (to hold bihash keys or some such).
 *
 * Unless the user wants the aux data scheme, don't configure it: all
 * three macros then expand to nothing and a stage takes (vm, node, b).
 *
 * When AUX_DATA_TYPE is defined before this file is included:
 *   AUX_DATA_ARG      trailing "AUX_DATA_TYPE *ap" parameter for a
 *                     stage prototype
 *   AUX_DATA_DECL     local array of VLIB_FRAME_SIZE aux elements,
 *                     named aux_data
 *   AUX_DATA_PTR(pi)  trailing argument: address of aux_data[pi]
 */
#ifndef AUX_DATA_TYPE
#define AUX_DATA_ARG
#define AUX_DATA_DECL
#define AUX_DATA_PTR(pi)
#else
/*
 * Use a plain comma. In an object-like macro, "," ## AUX_DATA_TYPE asks
 * the preprocessor to glue the comma onto the identifier, which does not
 * form a valid preprocessing token and is rejected by the compiler.
 */
#define AUX_DATA_ARG , AUX_DATA_TYPE *ap
#define AUX_DATA_DECL AUX_DATA_TYPE aux_data[VLIB_FRAME_SIZE]
#define AUX_DATA_PTR(pi) , aux_data + (pi)
#endif

/*
 * A prefetch stride of 2 is quasi-equivalent to doubling the number
 * of stages with every other pipeline stage empty.
 */

/*
 * This is a typical first pipeline stage, which prefetches
 * buffer metadata and the first line of pkt data.
 *
 * To use it:
 *  #define stage0 generic_stage0
 *
 * vm and node are not used in the body; they are part of the signature
 * because dispatch_pipeline() passes them to every stage.
 * This implementation won't use the aux data argument either.
 */
static STAGE_INLINE void
generic_stage0 (vlib_main_t * vm,
		vlib_node_runtime_t * node, vlib_buffer_t * b AUX_DATA_ARG)
{
  /* Prefetch the buffer header, requested with STORE */
  vlib_prefetch_buffer_header (b, STORE);
  /* Prefetch CLIB_CACHE_LINE_BYTES starting at b->data, also with STORE */
  CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE);
}

#if NSTAGES == 2

/*
 * Two-stage pipeline dispatch: stage0 followed by last_stage.
 *
 * Fills bufs[] via vlib_get_buffers() from the frame's arguments, then
 * walks the frame so that stage0 runs one slot ahead of last_stage:
 * in the same iteration stage0 is called on slot pi and last_stage on
 * slot pi - 1. The value returned by last_stage for a slot is stored in
 * nexts[] at that slot's index.
 *
 * Every stage call passes AUX_DATA_PTR for the slot it works on, so the
 * calls match the stage prototype when AUX_DATA_TYPE is configured.
 *
 * Finishes by calling vlib_buffer_enqueue_to_next() with from[], nexts[]
 * and frame->n_vectors, and returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from;
  u32 n_left_from;
  int pi;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
  u16 nexts[VLIB_FRAME_SIZE];
  AUX_DATA_DECL;

  n_left_from = frame->n_vectors;
  from = vlib_frame_args (frame);
  vlib_get_buffers (vm, from, bufs, n_left_from);

  /* Prologue: start stage0 on the first slot; bail out on an empty frame */
  for (pi = 0; pi < NSTAGES - 1; pi++)
    {
      if (pi == n_left_from)
	break;
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
    }

  /* Steady state: both stages busy, stage0 one slot ahead */
  for (; pi < n_left_from; pi++)
    {
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
      nexts[pi - 1] =
	last_stage (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
    }

  /*
   * Epilogue: no slots left for stage0; run last_stage on whatever is
   * still in flight. The range checks skip slots that do not exist
   * (e.g. pi - 1 == -1 when the frame is empty).
   */
  for (; pi < (n_left_from + (NSTAGES - 1)); pi++)
    {
      if (((pi - 1) >= 0) && ((pi - 1) < n_left_from))
	nexts[pi - 1] =
	  last_stage (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
    }

  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
  return frame->n_vectors;
}
#endif

#if NSTAGES == 3
/*
 * Three-stage pipeline dispatch: stage0, stage1, last_stage.
 *
 * Fills bufs[] via vlib_get_buffers() from the frame's arguments, then
 * walks the frame with the stages staggered by one slot each: in the
 * same iteration stage0 is called on slot pi, stage1 on slot pi - 1 and
 * last_stage on slot pi - 2. The value returned by last_stage for a slot
 * is stored in nexts[] at that slot's index.
 *
 * Every stage call passes AUX_DATA_PTR for the slot it works on, so the
 * calls match the stage prototype when AUX_DATA_TYPE is configured.
 *
 * Finishes by calling vlib_buffer_enqueue_to_next() with from[], nexts[]
 * and frame->n_vectors, and returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from;
  u32 n_left_from;
  int pi;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
  u16 nexts[VLIB_FRAME_SIZE];
  AUX_DATA_DECL;

  n_left_from = frame->n_vectors;
  from = vlib_frame_args (frame);
  vlib_get_buffers (vm, from, bufs, n_left_from);

  /* Prologue: fill the pipeline; stop early if the frame is short */
  for (pi = 0; pi < NSTAGES - 1; pi++)
    {
      if (pi == n_left_from)
	break;
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
      /* stage1 has nothing to work on until stage0 has seen slot 0 */
      if (pi - 1 >= 0)
	stage1 (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
    }

  /* Steady state: all three stages busy */
  for (; pi < n_left_from; pi++)
    {
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
      stage1 (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
      nexts[pi - 2] =
	last_stage (vm, node, bufs[pi - 2] AUX_DATA_PTR (pi - 2));
    }

  /*
   * Epilogue: no slots left for stage0; drain the later stages. The
   * range checks skip slots that do not exist, which happens when the
   * frame holds fewer than NSTAGES - 1 buffers.
   */
  for (; pi < (n_left_from + (NSTAGES - 1)); pi++)
    {
      if (((pi - 1) >= 0) && ((pi - 1) < n_left_from))
	stage1 (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
      if (((pi - 2) >= 0) && ((pi - 2) < n_left_from))
	nexts[pi - 2] =
	  last_stage (vm, node, bufs[pi - 2] AUX_DATA_PTR (pi - 2));
    }

  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
  return frame->n_vectors;
}
#endif

#if NSTAGES == 4
/*
 * Four-stage pipeline dispatch: stage0, stage1, stage2, last_stage.
 *
 * Fills bufs[] via vlib_get_buffers() from the frame's arguments, then
 * walks the frame with each stage trailing the previous one by a single
 * slot. last_stage's return value for a slot is stored in nexts[] at
 * that slot's index. Finishes by calling vlib_buffer_enqueue_to_next()
 * and returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 n_pkts = frame->n_vectors;
  u32 *bi = vlib_frame_args (frame);
  int slot;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
  u16 nexts[VLIB_FRAME_SIZE];
  AUX_DATA_DECL;

  vlib_get_buffers (vm, bi, bufs, n_pkts);

  /* Prologue: ramp the stages up one at a time; a short frame ends it early */
  for (slot = 0; slot < NSTAGES - 1 && slot != n_pkts; slot++)
    {
      stage0 (vm, node, bufs[slot] AUX_DATA_PTR (slot));
      if (slot >= 1)
	stage1 (vm, node, bufs[slot - 1] AUX_DATA_PTR (slot - 1));
      if (slot >= 2)
	stage2 (vm, node, bufs[slot - 2] AUX_DATA_PTR (slot - 2));
    }

  /* Steady state: every stage has a slot to work on */
  while (slot < n_pkts)
    {
      stage0 (vm, node, bufs[slot] AUX_DATA_PTR (slot));
      stage1 (vm, node, bufs[slot - 1] AUX_DATA_PTR (slot - 1));
      stage2 (vm, node, bufs[slot - 2] AUX_DATA_PTR (slot - 2));
      nexts[slot - 3] =
	last_stage (vm, node, bufs[slot - 3] AUX_DATA_PTR (slot - 3));
      slot++;
    }

  /* Epilogue: stage0 is done; drain, skipping slots that do not exist */
  for (; slot < (n_pkts + (NSTAGES - 1)); slot++)
    {
      int s1 = slot - 1;
      int s2 = slot - 2;
      int s3 = slot - 3;

      if (s1 >= 0 && s1 < n_pkts)
	stage1 (vm, node, bufs[s1] AUX_DATA_PTR (s1));
      if (s2 >= 0 && s2 < n_pkts)
	stage2 (vm, node, bufs[s2] AUX_DATA_PTR (s2));
      if (s3 >= 0 && s3 < n_pkts)
	nexts[s3] = last_stage (vm, node, bufs[s3] AUX_DATA_PTR (s3));
    }

  vlib_buffer_enqueue_to_next (vm, node, bi, nexts, frame->n_vectors);
  return frame->n_vectors;
}
#endif

#if NSTAGES == 5
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from;
  u32 n_left_from;
  int pi;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
  u16 nexts[VLIB_FRAME_SIZE];
  AUX_DATA_DECL;

  n_left_from = frame->n_vectors;
  from = vlib_frame_args (frame);
  vlib_get_buffers (vm, from, bufs, n_left_from);

  for (pi = 0; pi < NSTAGES - 1; pi++)
    {
      if (pi == n_left_from)
	break;
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
      if (pi - 1 >= 0)
	stage1 (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
      if (pi - 2 >= 0)
	stage2 (vm, node, bufs[pi - 2] AUX_DATA_PTR (pi - 2));
      if (pi - 3 >= 0)
	stage3 (vm, node, bufs[pi - 3] AUX_DATA_PTR (pi - 3));
    }

  for (; pi < n_left_from; pi++)
    {
      stage0 (vm, node, bufs[pi] AUX_DATA_PTR (pi));
      stage1 (vm, node, bufs[pi - 1] AUX_DATA_PTR (pi - 1));
      stage2 (vm, node, bufs[pi - 2] AUX_DATA_PTR (pi - 2));
      stage3 (vm, node, bufs[pi - 3] AUX_DATA_PTR (pi - 3));<style>.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #000