/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <vlib/vlib.h>
#include <vnet/vnet.h>
#include <vnet/pg/pg.h>
#include <vnet/ethernet/ethernet.h>
#include <vppinfra/error.h>
#include <sample/sample.h>

typedef struct
{
  u32 next_index;
  u32 sw_if_index;
  u8 new_src_mac[6];
  u8 new_dst_mac[6];
} sample_trace_t;


/* packet trace format function */
static u8 *
format_sample_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  sample_trace_t *t = va_arg (*args, sample_trace_t *);

  s = format (s, "SAMPLE: sw_if_index %d, next index %d\n",
	      t->sw_if_index, t->next_index);
  s = format (s, "  new src %U -> new dst %U",
	      format_mac_address, t->new_src_mac,
	      format_mac_address, t->new_dst_mac);

  return s;
}
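/*
 * With the format function above, "show trace" output for this node looks
 * roughly like (addresses are illustrative):
 *
 *   SAMPLE: sw_if_index 1, next index 0
 *     new src 02:fe:12:34:56:78 -> new dst 02:fe:9a:bc:de:f0
 */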

extern vlib_node_registration_t sample_node;

#define foreach_sample_error \
_(SWAPPED, "Mac swap packets processed")

typedef enum
{
#define _(sym,str) SAMPLE_ERROR_##sym,
  foreach_sample_error
#undef _
    SAMPLE_N_ERROR,
} sample_error_t;

static char *sample_error_strings[] = {
#define _(sym,string) string,
  foreach_sample_error
#undef _
};
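/*
 * The foreach_sample_error X-macro expands once into the enum
 * (SAMPLE_ERROR_SWAPPED) and once into sample_error_strings[] above,
 * keeping counter ids and their display strings in sync from a single list.
 */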

typedef enum
{
  SAMPLE_NEXT_INTERFACE_OUTPUT,
  SAMPLE_N_NEXT,
} sample_next_t;

/*
 * Simple dual/single loop version; this is the default because it
 * compiles everywhere.
 *
 * Node costs about 30 clocks/pkt at a vector size of 51.
 */

#define VERSION_1 1
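/*
 * Exactly one of VERSION_1 .. VERSION_4 should be defined at a time;
 * each #ifdef block below supplies its own VLIB_NODE_FN (sample_node).
 */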
#ifdef VERSION_1
#define foreach_mac_address_offset              \
_(0)                                            \
_(1)                                            \
_(2)                                            \
_(3)                                            \
_(4)                                            \
_(5)
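/*
 * Each "#define _(a) ..." below, followed by foreach_mac_address_offset,
 * unrolls into six per-byte statements, e.g.
 *   tmp0[0] = en0->src_address[0]; ... tmp0[5] = en0->src_address[5];
 */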

VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  sample_next_t next_index;
  u32 pkts_swapped = 0;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0, sw_if_index1;
	  u8 tmp0[6], tmp1[6];
	  ethernet_header_t *en0, *en1;
	  u32 bi0, bi1;
	  vlib_buffer_t *b0, *b1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, LOAD);
	    vlib_prefetch_buffer_header (p3, LOAD);

	    CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
	    CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
	  }

	  /* speculatively enqueue b0 and b1 to the current next frame */
	  to_next[0] = bi0 = from[0];
	  to_next[1] = bi1 = from[1];
	  from += 2;
	  to_next += 2;
	  n_left_from -= 2;
	  n_left_to_next -= 2;

	  b0 = vlib_get_buffer (vm, bi0);
	  b1 = vlib_get_buffer (vm, bi1);

	  ASSERT (b0->current_data == 0);
	  ASSERT (b1->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);
	  en1 = vlib_buffer_get_current (b1);

	  /* This is not the fastest way to swap src + dst mac addresses */
#define _(a) tmp0[a] = en0->src_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en0->src_address[a] = en0->dst_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en0->dst_address[a] = tmp0[a];
	  foreach_mac_address_offset;
#undef _

#define _(a) tmp1[a] = en1->src_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en1->src_address[a] = en1->dst_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en1->dst_address[a] = tmp1[a];
	  foreach_mac_address_offset;
#undef _

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
	  sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;

	  pkts_swapped += 2;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
	    {
	      if (b0->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b0, sizeof (*t));
		  t->sw_if_index = sw_if_index0;
		  t->next_index = next0;
		  clib_memcpy_fast (t->new_src_mac, en0->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				    sizeof (t->new_dst_mac));

		}
	      if (b1->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b1, sizeof (*t));
		  t->sw_if_index = sw_if_index1;
		  t->next_index = next1;
		  clib_memcpy_fast (t->new_src_mac, en1->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en1->dst_address,
				    sizeof (t->new_dst_mac));
		}
	    }

	  /* verify speculative enqueues, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0;
	  u8 tmp0[6];
	  ethernet_header_t *en0;

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  /*
	   * Direct from the driver, we should be at offset 0
	   * aka at &b0->data[0]
	   */
	  ASSERT (b0->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);

	  /* This is not the fastest way to swap src + dst mac addresses */
#define _(a) tmp0[a] = en0->src_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en0->src_address[a] = en0->dst_address[a];
	  foreach_mac_address_offset;
#undef _
#define _(a) en0->dst_address[a] = tmp0[a];
	  foreach_mac_address_offset;
#undef _

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
			     && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
	      t->sw_if_index = sw_if_index0;
	      t->next_index = next0;
	      clib_memcpy_fast (t->new_src_mac, en0->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				sizeof (t->new_dst_mac));
	    }

	  pkts_swapped += 1;

	  /* verify speculative enqueue, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, pkts_swapped);
  return frame->n_vectors;
}
#endif

/*
 * This version swaps mac addresses using a 16-byte (u8x16) vector shuffle.
 * Node costs about 17 clocks/pkt at a vector size of 26.
 */
#ifdef VERSION_2
VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from, *to_next;
  sample_next_t next_index;
  u32 pkts_swapped = 0;
  /* Vector shuffle mask to swap src, dst */
  u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
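  /*
   * An ethernet header is dst[6] | src[6] | type[2].  The mask selects
   * bytes 6..11 (the original src) into the dst slot, bytes 0..5 (the
   * original dst) into the src slot, and leaves bytes 12..15 (the
   * ethertype plus the first two payload bytes) in place.
   */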

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
      while (n_left_from >= 4 && n_left_to_next >= 2)
	{
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0, sw_if_index1;
	  u8x16 src_dst0, src_dst1;
	  ethernet_header_t *en0, *en1;
	  u32 bi0, bi1;
	  vlib_buffer_t *b0, *b1;

	  /* Prefetch next iteration. */
	  {
	    vlib_buffer_t *p2, *p3;

	    p2 = vlib_get_buffer (vm, from[2]);
	    p3 = vlib_get_buffer (vm, from[3]);

	    vlib_prefetch_buffer_header (p2, LOAD);
	    vlib_prefetch_buffer_header (p3, LOAD);

	    CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
	    CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
	  }

	  /* speculatively enqueue b0 and b1 to the current next frame */
	  to_next[0] = bi0 = from[0];
	  to_next[1] = bi1 = from[1];
	  from += 2;
	  to_next += 2;
	  n_left_from -= 2;
	  n_left_to_next -= 2;

	  b0 = vlib_get_buffer (vm, bi0);
	  b1 = vlib_get_buffer (vm, bi1);

	  ASSERT (b0->current_data == 0);
	  ASSERT (b1->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);
	  en1 = vlib_buffer_get_current (b1);

	  src_dst0 = ((u8x16 *) en0)[0];
	  src_dst1 = ((u8x16 *) en1)[0];
	  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
	  src_dst1 = u8x16_shuffle (src_dst1, swapmac);
	  ((u8x16 *) en0)[0] = src_dst0;
	  ((u8x16 *) en1)[0] = src_dst1;

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
	  sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
	  vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;

	  pkts_swapped += 2;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
	    {
	      if (b0->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b0, sizeof (*t));
		  t->sw_if_index = sw_if_index0;
		  t->next_index = next0;
		  clib_memcpy_fast (t->new_src_mac, en0->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				    sizeof (t->new_dst_mac));

		}
	      if (b1->flags & VLIB_BUFFER_IS_TRACED)
		{
		  sample_trace_t *t =
		    vlib_add_trace (vm, node, b1, sizeof (*t));
		  t->sw_if_index = sw_if_index1;
		  t->next_index = next1;
		  clib_memcpy_fast (t->new_src_mac, en1->src_address,
				    sizeof (t->new_src_mac));
		  clib_memcpy_fast (t->new_dst_mac, en1->dst_address,
				    sizeof (t->new_dst_mac));
		}
	    }

	  /* verify speculative enqueues, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, bi1, next0, next1);
	}

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
	  u32 sw_if_index0;
	  u8x16 src_dst0;
	  ethernet_header_t *en0;

	  /* speculatively enqueue b0 to the current next frame */
	  bi0 = from[0];
	  to_next[0] = bi0;
	  from += 1;
	  to_next += 1;
	  n_left_from -= 1;
	  n_left_to_next -= 1;

	  b0 = vlib_get_buffer (vm, bi0);
	  /*
	   * Direct from the driver, we should be at offset 0
	   * aka at &b0->data[0]
	   */
	  ASSERT (b0->current_data == 0);

	  en0 = vlib_buffer_get_current (b0);
	  src_dst0 = ((u8x16 *) en0)[0];
	  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
	  ((u8x16 *) en0)[0] = src_dst0;

	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

	  /* Send pkt back out the RX interface */
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;

	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
			     && (b0->flags & VLIB_BUFFER_IS_TRACED)))
	    {
	      sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
	      t->sw_if_index = sw_if_index0;
	      t->next_index = next0;
	      clib_memcpy_fast (t->new_src_mac, en0->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en0->dst_address,
				sizeof (t->new_dst_mac));
	    }

	  pkts_swapped += 1;

	  /* verify speculative enqueue, maybe switch current next frame */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					   to_next, n_left_to_next,
					   bi0, next0);
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, pkts_swapped);
  return frame->n_vectors;
}
#endif


/*
 * This version computes all of the buffer pointers in
 * one motion, uses a quad/single loop model, and
 * traces the entire frame in one motion.
 *
 * Node costs about 16 clocks/pkt at a vector size of 26
 *
 * u8x16_shuffle has caused compilation drama on some toolchains, so this
 * version is turned off by default.
 */

#ifdef VERSION_3

#define u8x16_shuffle __builtin_shuffle
/* This would normally be a stack local, but since it's a constant... */
static const u16 nexts[VLIB_FRAME_SIZE] = { 0 };
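/*
 * Because every packet goes to graph arc 0, a constant zero-filled
 * next-index array can be handed to vlib_buffer_enqueue_to_next()
 * unchanged for every frame; nothing needs to be written per packet.
 */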

VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  u32 n_left_from, *from;
  u32 pkts_swapped = 0;
  /* Vector shuffle mask to swap src, dst */
  u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  /* See comment below about sending all pkts to the same place... */
  u16 *next __attribute__ ((unused));

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  vlib_get_buffers (vm, from, bufs, n_left_from);
  b = bufs;
  // next = nexts;

  /*
   * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka
   * graph arc 0. So the usual setting of next[0...3] is commented
   * out below
   */

  while (n_left_from >= 4)
    {
      u8x16 src_dst0, src_dst1, src_dst2, src_dst3;
      /* Prefetch next iteration. */
      if (PREDICT_TRUE (n_left_from >= 8))
	{
	  vlib_prefetch_buffer_header (b[4], STORE);
	  vlib_prefetch_buffer_header (b[5], STORE);
	  vlib_prefetch_buffer_header (b[6], STORE);
	  vlib_prefetch_buffer_header (b[7], STORE);
	  CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE);
	  CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE);
	}

      src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
      src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0];
      src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0];
      src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0];

      src_dst0 = u8x16_shuffle (src_dst0, swapmac);
      src_dst1 = u8x16_shuffle (src_dst1, swapmac);
      src_dst2 = u8x16_shuffle (src_dst2, swapmac);
      src_dst3 = u8x16_shuffle (src_dst3, swapmac);

      ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
      ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1;
      ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2;
      ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3;

      vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[0])->sw_if_index[VLIB_RX];
      vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[1])->sw_if_index[VLIB_RX];
      vnet_buffer (b[2])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[2])->sw_if_index[VLIB_RX];
      vnet_buffer (b[3])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[3])->sw_if_index[VLIB_RX];

      // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT;
      // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT;

      b += 4;
      // next += 4;
      n_left_from -= 4;
      pkts_swapped += 4;
    }

  while (n_left_from > 0)
    {
      u8x16 src_dst0;
      src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
      src_dst0 = u8x16_shuffle (src_dst0, swapmac);
      ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
      vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
	vnet_buffer (b[0])->sw_if_index[VLIB_RX];
      // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;

      b += 1;
      // next += 1;
      n_left_from -= 1;
      pkts_swapped += 1;

    }
  vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
			       frame->n_vectors);

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, pkts_swapped);

  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
    {
      int i;
      b = bufs;

      for (i = 0; i < frame->n_vectors; i++)
	{
	  if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
	    {
	      ethernet_header_t *en;
	      sample_trace_t *t =
		vlib_add_trace (vm, node, b[0], sizeof (*t));
	      t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
	      t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
	      en = vlib_buffer_get_current (b[0]);
	      clib_memcpy_fast (t->new_src_mac, en->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en->dst_address,
				sizeof (t->new_dst_mac));
	      b++;
	    }
	  else
	    break;
	}
    }
  return frame->n_vectors;
}
#endif

/*
 * This version computes all of the buffer pointers in
 * one motion, uses a fully pipelined loop model, and
 * traces the entire frame in one motion.
 *
 * It's performance-competitive with the other coding paradigms,
 * and it's the simplest way to write performant vpp code.
 */


#ifdef VERSION_4

#define u8x16_shuffle __builtin_shuffle

static u8x16 swapmac =
  { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };

/* Final stage in the pipeline, do the mac swap */
static inline u32
last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
{
  u8x16 src_dst0;
  src_dst0 = ((u8x16 *) vlib_buffer_get_current (b))[0];
  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
  ((u8x16 *) vlib_buffer_get_current (b))[0] = src_dst0;
  vnet_buffer (b)->sw_if_index[VLIB_TX] =
    vnet_buffer (b)->sw_if_index[VLIB_RX];
  /* Return the next index for this buffer: 0 == SAMPLE_NEXT_INTERFACE_OUTPUT */
  return 0;
}

/*
 * Add a couple of nil stages to increase the prefetch stride.
 * For any specific platform, the optimal prefetch stride may differ.
 */
static inline void
stage1 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
{
}

static inline void
stage2 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
{
}

#define NSTAGES 4
#define STAGE_INLINE inline __attribute__((__always_inline__))

#define stage0 generic_stage0

#include <vnet/pipeline.h>
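/*
 * Roughly, vnet/pipeline.h synthesizes dispatch_pipeline() from the stage
 * functions named above: stage0 (here the generic buffer prefetch), the
 * nil stage1/stage2 padding, and last_stage, whose return value supplies
 * the next index for each buffer.  NSTAGES sets how far ahead of the
 * current packet the prefetch stage runs.
 */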

VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame)
{
  /* Buffer pointers are needed for the trace pass below */
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;

  dispatch_pipeline (vm, node, frame);

  vlib_node_increment_counter (vm, sample_node.index,
			       SAMPLE_ERROR_SWAPPED, frame->n_vectors);
  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
    {
      int i;

      vlib_get_buffers (vm, vlib_frame_vector_args (frame), bufs,
			frame->n_vectors);
      b = bufs;

      for (i = 0; i < frame->n_vectors; i++)
	{
	  if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
	    {
	      ethernet_header_t *en;
	      sample_trace_t *t =
		vlib_add_trace (vm, node, b[0], sizeof (*t));
	      t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
	      t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
	      en = vlib_buffer_get_current (b[0]);
	      clib_memcpy_fast (t->new_src_mac, en->src_address,
				sizeof (t->new_src_mac));
	      clib_memcpy_fast (t->new_dst_mac, en->dst_address,
				sizeof (t->new_dst_mac));
	      b++;
	    }
	  else
	    break;
	}
    }
  return frame->n_vectors;
}
#endif

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (sample_node) =
{
  .name = "sample",
  .vector_size = sizeof (u32),
  .format_trace = format_sample_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,

  .n_errors = ARRAY_LEN(sample_error_strings),
  .error_strings = sample_error_strings,

  .n_next_nodes = SAMPLE_N_NEXT,

  /* edit / add dispositions here */
  .next_nodes = {
    [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output",
  },
};
/* *INDENT-ON* */
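/*
 * How the node typically gets packets (a sketch; the actual hookup lives
 * in sample.c, not in this file): the plugin registers the node on the
 * "device-input" feature arc and enables it per interface, e.g.
 *
 *   VNET_FEATURE_INIT (sample, static) = {
 *     .arc_name = "device-input",
 *     .node_name = "sample",
 *     .runs_before = VNET_FEATURES ("ethernet-input"),
 *   };
 *
 *   vnet_feature_enable_disable ("device-input", "sample",
 *                                sw_if_index, enable_disable, 0, 0);
 */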

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */