/*
 * vnet/pipeline.h: software pipeline
 *
 * Copyright (c) 2012 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

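/*
 * Usage example.
 *
 * #define NSTAGES 3	(this file provides variants for 2 through 6)
 *
 * <Define the pipeline stages: stage0 .. stage(NSTAGES-2) and last_stage.
 *  Each one is invoked as stageN (vm, node, buffer_index); the value
 *  returned by last_stage is used as the next index for that buffer.>
 *
 * #include <vnet/pipeline.h>
 *
 * static uword my_node_fn (vlib_main_t * vm,
 *                               vlib_node_runtime_t * node,
 *                               vlib_frame_t * frame)
 * {
 *     return dispatch_pipeline (vm, node, frame);
 * }
 *
 */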

/*
 * NSTAGES must be defined by the including file; it selects which
 * dispatch_pipeline variant further down in this header is compiled.
 */
#ifndef NSTAGES
#error files which #include <vnet/pipeline.h> must define NSTAGES
#endif

/*
 * STAGE_INLINE is the qualifier placed on generic_stage0 and
 * dispatch_pipeline.  It defaults to plain "inline" unless the including
 * file defines it first.
 */
#ifndef STAGE_INLINE
#define STAGE_INLINE inline
#endif

/*
 * A prefetch stride of 2 is quasi-equivalent to doubling the number
 * of stages with every other pipeline stage empty.
 */

/*
 * This is a typical first pipeline stage, which prefetches
 * buffer metadata and the first line of pkt data.
 * To use it:
 *  #define stage0 generic_stage0
 */
static STAGE_INLINE void
generic_stage0 (vlib_main_t * vm,
		vlib_node_runtime_t * node, u32 buffer_index)
{
  vlib_buffer_t *pkt;

  /* Resolve the buffer index to its buffer; node is not used here. */
  pkt = vlib_get_buffer (vm, buffer_index);

  /* Issue STORE prefetches: first the buffer header ... */
  vlib_prefetch_buffer_header (pkt, STORE);

  /* ... then one cache line starting at the buffer's data area. */
  CLIB_PREFETCH (pkt->data, CLIB_CACHE_LINE_BYTES, STORE);
}

#if NSTAGES == 2

/*
 * dispatch_pipeline, NSTAGES == 2 variant.
 *
 * Walks the buffer indices held in frame.  In each steady-state iteration
 * stage0 is called on buffer pi and last_stage on buffer pi - 1, so a
 * buffer's stage0 runs one iteration ahead of its last_stage.  The value
 * returned by last_stage is passed to vlib_validate_buffer_enqueue_x1 as
 * the next index for that buffer.
 *
 * Returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index, next0;
  int pi, pi_limit;

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* One pass covers the smaller of the remaining input count and
         n_left_to_next. */
      pi_limit = clib_min (n_left_from, n_left_to_next);

      /* Fill: run stage0 on the first NSTAGES - 1 buffers (fewer when
         pi_limit is smaller) without finishing any of them. */
      for (pi = 0; pi < NSTAGES - 1; pi++)
	{
	  if (pi == pi_limit)
	    break;
	  stage0 (vm, node, from[pi]);
	}

      /* Steady state: start buffer pi, finish buffer pi - 1. */
      for (; pi < pi_limit; pi++)
	{
	  stage0 (vm, node, from[pi]);
	  /* The index is stored in the next frame before last_stage
	     picks next0. */
	  to_next[0] = from[pi - 1];
	  to_next++;
	  n_left_to_next--;
	  next0 = last_stage (vm, node, from[pi - 1]);
	  /* NOTE(review): next_index, to_next and n_left_to_next are passed
	     by name; presumably this macro updates them when next0 differs
	     from next_index -- confirm against its definition. */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   from[pi - 1], next0);
	  n_left_from--;
	  /* n_left_to_next is a u32, hence the cast to test whether it has
	     been decremented below zero. */
	  if ((int) n_left_to_next < 0 && n_left_from > 0)
	    vlib_get_next_frame (vm, node, next_index, to_next,
				 n_left_to_next);
	}

      /* Drain: finish the buffers still in flight.  The bounds test skips
         any index outside [0, pi_limit). */
      for (; pi < (pi_limit + (NSTAGES - 1)); pi++)
	{
	  if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
	    {
	      to_next[0] = from[pi - 1];
	      to_next++;
	      n_left_to_next--;
	      next0 = last_stage (vm, node, from[pi - 1]);
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       from[pi - 1], next0);
	      n_left_from--;
	      if ((int) n_left_to_next < 0 && n_left_from > 0)
		vlib_get_next_frame (vm, node, next_index, to_next,
				     n_left_to_next);
	    }
	}
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
      /* Step past the pi_limit input indices this pass walked. */
      from += pi_limit;
    }
  return frame->n_vectors;
}
#endif

#if NSTAGES == 3
/*
 * dispatch_pipeline, NSTAGES == 3 variant.
 *
 * Walks the buffer indices held in frame.  In each steady-state iteration
 * stage0 is called on buffer pi, stage1 on buffer pi - 1 and last_stage on
 * buffer pi - 2, so every buffer passes through the stages in order, one
 * stage per iteration.  The value returned by last_stage is passed to
 * vlib_validate_buffer_enqueue_x1 as the next index for that buffer.
 *
 * Returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index, next0;
  int pi, pi_limit;

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* One pass covers the smaller of the remaining input count and
         n_left_to_next. */
      pi_limit = clib_min (n_left_from, n_left_to_next);

      /* Fill: start the first NSTAGES - 1 buffers (fewer when pi_limit is
         smaller); stage1 only runs once a previous buffer exists. */
      for (pi = 0; pi < NSTAGES - 1; pi++)
	{
	  if (pi == pi_limit)
	    break;
	  stage0 (vm, node, from[pi]);
	  if (pi - 1 >= 0)
	    stage1 (vm, node, from[pi - 1]);
	}

      /* Steady state: every stage has a buffer to work on. */
      for (; pi < pi_limit; pi++)
	{
	  stage0 (vm, node, from[pi]);
	  stage1 (vm, node, from[pi - 1]);
	  /* The index is stored in the next frame before last_stage
	     picks next0. */
	  to_next[0] = from[pi - 2];
	  to_next++;
	  n_left_to_next--;
	  next0 = last_stage (vm, node, from[pi - 2]);
	  /* NOTE(review): next_index, to_next and n_left_to_next are passed
	     by name; presumably this macro updates them when next0 differs
	     from next_index -- confirm against its definition. */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   from[pi - 2], next0);
	  n_left_from--;
	  /* n_left_to_next is a u32, hence the cast to test whether it has
	     been decremented below zero. */
	  if ((int) n_left_to_next < 0 && n_left_from > 0)
	    vlib_get_next_frame (vm, node, next_index, to_next,
				 n_left_to_next);
	}


      /* Drain: no new buffers are started; the bounds tests skip any
         index outside [0, pi_limit). */
      for (; pi < (pi_limit + (NSTAGES - 1)); pi++)
	{
	  if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
	    stage1 (vm, node, from[pi - 1]);
	  if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
	    {
	      to_next[0] = from[pi - 2];
	      to_next++;
	      n_left_to_next--;
	      next0 = last_stage (vm, node, from[pi - 2]);
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       from[pi - 2], next0);
	      n_left_from--;
	      if ((int) n_left_to_next < 0 && n_left_from > 0)
		vlib_get_next_frame (vm, node, next_index, to_next,
				     n_left_to_next);
	    }
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
      /* Step past the pi_limit input indices this pass walked. */
      from += pi_limit;
    }
  return frame->n_vectors;
}
#endif

#if NSTAGES == 4
/*
 * dispatch_pipeline, NSTAGES == 4 variant.
 *
 * Walks the buffer indices held in frame.  In each steady-state iteration
 * stage0, stage1 and stage2 are called on buffers pi, pi - 1 and pi - 2,
 * and last_stage on buffer pi - 3, so every buffer passes through the
 * stages in order, one stage per iteration.  The value returned by
 * last_stage is passed to vlib_validate_buffer_enqueue_x1 as the next
 * index for that buffer.
 *
 * Returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index, next0;
  int pi, pi_limit;

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* One pass covers the smaller of the remaining input count and
         n_left_to_next. */
      pi_limit = clib_min (n_left_from, n_left_to_next);

      /* Fill: start the first NSTAGES - 1 buffers (fewer when pi_limit is
         smaller); later stages only run once a buffer has reached them. */
      for (pi = 0; pi < NSTAGES - 1; pi++)
	{
	  if (pi == pi_limit)
	    break;
	  stage0 (vm, node, from[pi]);
	  if (pi - 1 >= 0)
	    stage1 (vm, node, from[pi - 1]);
	  if (pi - 2 >= 0)
	    stage2 (vm, node, from[pi - 2]);
	}

      /* Steady state: every stage has a buffer to work on. */
      for (; pi < pi_limit; pi++)
	{
	  stage0 (vm, node, from[pi]);
	  stage1 (vm, node, from[pi - 1]);
	  stage2 (vm, node, from[pi - 2]);
	  /* The index is stored in the next frame before last_stage
	     picks next0. */
	  to_next[0] = from[pi - 3];
	  to_next++;
	  n_left_to_next--;
	  next0 = last_stage (vm, node, from[pi - 3]);
	  /* NOTE(review): next_index, to_next and n_left_to_next are passed
	     by name; presumably this macro updates them when next0 differs
	     from next_index -- confirm against its definition. */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   from[pi - 3], next0);
	  n_left_from--;
	  /* n_left_to_next is a u32, hence the cast to test whether it has
	     been decremented below zero. */
	  if ((int) n_left_to_next < 0 && n_left_from > 0)
	    vlib_get_next_frame (vm, node, next_index, to_next,
				 n_left_to_next);
	}


      /* Drain: no new buffers are started; the bounds tests skip any
         index outside [0, pi_limit). */
      for (; pi < (pi_limit + (NSTAGES - 1)); pi++)
	{
	  if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
	    stage1 (vm, node, from[pi - 1]);
	  if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
	    stage2 (vm, node, from[pi - 2]);
	  if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
	    {
	      to_next[0] = from[pi - 3];
	      to_next++;
	      n_left_to_next--;
	      next0 = last_stage (vm, node, from[pi - 3]);
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       from[pi - 3], next0);
	      n_left_from--;
	      if ((int) n_left_to_next < 0 && n_left_from > 0)
		vlib_get_next_frame (vm, node, next_index, to_next,
				     n_left_to_next);
	    }
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
      /* Step past the pi_limit input indices this pass walked. */
      from += pi_limit;
    }
  return frame->n_vectors;
}
#endif


#if NSTAGES == 5
/*
 * dispatch_pipeline, NSTAGES == 5 variant.
 *
 * Walks the buffer indices held in frame.  In each steady-state iteration
 * stage0 .. stage3 are called on buffers pi .. pi - 3, and last_stage on
 * buffer pi - 4, so every buffer passes through the stages in order, one
 * stage per iteration.  The value returned by last_stage is passed to
 * vlib_validate_buffer_enqueue_x1 as the next index for that buffer.
 *
 * Returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index, next0;
  int pi, pi_limit;

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* One pass covers the smaller of the remaining input count and
         n_left_to_next. */
      pi_limit = clib_min (n_left_from, n_left_to_next);

      /* Fill: start the first NSTAGES - 1 buffers (fewer when pi_limit is
         smaller); later stages only run once a buffer has reached them. */
      for (pi = 0; pi < NSTAGES - 1; pi++)
	{
	  if (pi == pi_limit)
	    break;
	  stage0 (vm, node, from[pi]);
	  if (pi - 1 >= 0)
	    stage1 (vm, node, from[pi - 1]);
	  if (pi - 2 >= 0)
	    stage2 (vm, node, from[pi - 2]);
	  if (pi - 3 >= 0)
	    stage3 (vm, node, from[pi - 3]);
	}

      /* Steady state: every stage has a buffer to work on. */
      for (; pi < pi_limit; pi++)
	{
	  stage0 (vm, node, from[pi]);
	  stage1 (vm, node, from[pi - 1]);
	  stage2 (vm, node, from[pi - 2]);
	  stage3 (vm, node, from[pi - 3]);
	  /* The index is stored in the next frame before last_stage
	     picks next0. */
	  to_next[0] = from[pi - 4];
	  to_next++;
	  n_left_to_next--;
	  next0 = last_stage (vm, node, from[pi - 4]);
	  /* NOTE(review): next_index, to_next and n_left_to_next are passed
	     by name; presumably this macro updates them when next0 differs
	     from next_index -- confirm against its definition. */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   from[pi - 4], next0);
	  n_left_from--;
	  /* n_left_to_next is a u32, hence the cast to test whether it has
	     been decremented below zero. */
	  if ((int) n_left_to_next < 0 && n_left_from > 0)
	    vlib_get_next_frame (vm, node, next_index, to_next,
				 n_left_to_next);
	}


      /* Drain: no new buffers are started; the bounds tests skip any
         index outside [0, pi_limit). */
      for (; pi < (pi_limit + (NSTAGES - 1)); pi++)
	{
	  if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
	    stage1 (vm, node, from[pi - 1]);
	  if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
	    stage2 (vm, node, from[pi - 2]);
	  if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
	    stage3 (vm, node, from[pi - 3]);
	  if (((pi - 4) >= 0) && ((pi - 4) < pi_limit))
	    {
	      to_next[0] = from[pi - 4];
	      to_next++;
	      n_left_to_next--;
	      next0 = last_stage (vm, node, from[pi - 4]);
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       from[pi - 4], next0);
	      n_left_from--;
	      if ((int) n_left_to_next < 0 && n_left_from > 0)
		vlib_get_next_frame (vm, node, next_index, to_next,
				     n_left_to_next);
	    }
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
      /* Step past the pi_limit input indices this pass walked. */
      from += pi_limit;
    }
  return frame->n_vectors;
}
#endif

#if NSTAGES == 6
/*
 * dispatch_pipeline, NSTAGES == 6 variant.
 *
 * Walks the buffer indices held in frame.  In each steady-state iteration
 * stage0 .. stage4 are called on buffers pi .. pi - 4, and last_stage on
 * buffer pi - 5, so every buffer passes through the stages in order, one
 * stage per iteration.  The value returned by last_stage is passed to
 * vlib_validate_buffer_enqueue_x1 as the next index for that buffer.
 *
 * Returns frame->n_vectors.
 */
static STAGE_INLINE uword
dispatch_pipeline (vlib_main_t * vm,
		   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index, next0;
  int pi, pi_limit;

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* One pass covers the smaller of the remaining input count and
         n_left_to_next. */
      pi_limit = clib_min (n_left_from, n_left_to_next);

      /* Fill: start the first NSTAGES - 1 buffers (fewer when pi_limit is
         smaller); later stages only run once a buffer has reached them. */
      for (pi = 0; pi < NSTAGES - 1; pi++)
	{
	  if (pi == pi_limit)
	    break;
	  stage0 (vm, node, from[pi]);
	  if (pi - 1 >= 0)
	    stage1 (vm, node, from[pi - 1]);
	  if (pi - 2 >= 0)
	    stage2 (vm, node, from[pi - 2]);
	  if (pi - 3 >= 0)
	    stage3 (vm, node, from[pi - 3]);
	  if (pi - 4 >= 0)
	    stage4 (vm, node, from[pi - 4]);
	}

      /* Steady state: every stage has a buffer to work on. */
      for (; pi < pi_limit; pi++)
	{
	  stage0 (vm, node, from[pi]);
	  stage1 (vm, node, from[pi - 1]);
	  stage2 (vm, node, from[pi - 2]);
	  stage3 (vm, node, from[pi - 3]);
	  stage4 (vm, node, from[pi - 4]);
	  /* The index is stored in the next frame before last_stage
	     picks next0. */
	  to_next[0] = from[pi - 5];
	  to_next++;
	  n_left_to_next--;
	  next0 = last_stage (vm, node, from[pi - 5]);
	  /* NOTE(review): next_index, to_next and n_left_to_next are passed
	     by name; presumably this macro updates them when next0 differs
	     from next_index -- confirm against its definition. */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   from[pi - 5], next0);
	  n_left_from--;
	  /* n_left_to_next is a u32, hence the cast to test whether it has
	     been decremented below zero. */
	  if ((int) n_left_to_next < 0 && n_left_from > 0)
	    vlib_get_next_frame (vm, node, next_index, to_next,
				 n_left_to_next);
	}


      /* Drain: no new buffers are started; the bounds tests skip any
         index outside [0, pi_limit). */
      for (; pi < (pi_limit + (NSTAGES - 1)); pi++)
	{
	  if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
	    stage1 (vm, node, from[pi - 1]);
	  if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
	    stage2 (vm, node, from[pi - 2]);
	  if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
	    stage3 (vm, node, from[pi - 3]);
	  if (((pi - 4) >= 0) && ((pi - 4) < pi_limit))
	    stage4 (vm, node, from[pi - 4]);
	  if (((pi - 5) >= 0) && ((pi - 5) < pi_limit))
	    {
	      to_next[0] = from[pi - 5];
	      to_next++;
	      n_left_to_next--;
	      next0 = last_stage (vm, node, from[pi - 5]);
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       from[pi - 5], next0);
	      n_left_from--;
	      if ((int) n_left_to_next < 0 && n_left_from > 0)
		vlib_get_next_frame (vm, node, next_index, to_next,
				     n_left_to_next);
	    }
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
      /* Step past the pi_limit input indices this pass walked. */
      from += pi_limit;
    }
  return frame->n_vectors;
}
#endif

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */