about | summary | refs | log | tree | commit | diff | stats
path: root/src/examples
diff options
context:
space:
mode:
author Dave Barach <dbarach@cisco.com> 2018-07-25 16:56:38 -0400
committer Florin Coras <florin.coras@gmail.com> 2018-07-26 19:09:14 +0000
commit 9594b56e6f795751cfef562920ded7874376e91c (patch)
tree 717f1fab947ff7d3f3cb944660c7dd990cd93857 /src/examples
parent 06dcd45ff81e06bc8cf40ed487c0b2652d346a5a (diff)
Improve the sample plugin node dispatch function
Three separate implementations, which vary by nearly a factor of two in performance. Most of the performance difference is due to swapping the src/dst mac addresses with an avx2 vector shuffle instruction.

Change-Id: Ieb36546d6074e4ac720d452a99d013c698135c57
Signed-off-by: Dave Barach <dave@barachs.net>
Diffstat (limited to 'src/examples')
-rw-r--r-- src/examples/sample-plugin/sample/node.c 601
1 files changed, 473 insertions, 128 deletions
diff --git a/src/examples/sample-plugin/sample/node.c b/src/examples/sample-plugin/sample/node.c
index 94c1706b20a..1b512090469 100644
--- a/src/examples/sample-plugin/sample/node.c
+++ b/src/examples/sample-plugin/sample/node.c
@@ -18,7 +18,8 @@
#include <vppinfra/error.h>
#include <sample/sample.h>
-typedef struct {
+typedef struct
+{
u32 next_index;
u32 sw_if_index;
u8 new_src_mac[6];
@@ -34,17 +35,18 @@ format_mac_address (u8 * s, va_list * args)
}
/* packet trace format function */
-static u8 * format_sample_trace (u8 * s, va_list * args)
+static u8 *
+format_sample_trace (u8 * s, va_list * args)
{
CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
- sample_trace_t * t = va_arg (*args, sample_trace_t *);
-
+ sample_trace_t *t = va_arg (*args, sample_trace_t *);
+
s = format (s, "SAMPLE: sw_if_index %d, next index %d\n",
- t->sw_if_index, t->next_index);
+ t->sw_if_index, t->next_index);
s = format (s, " new src %U -> new dst %U",
- format_mac_address, t->new_src_mac,
- format_mac_address, t->new_dst_mac);
+ format_mac_address, t->new_src_mac,
+ format_mac_address, t->new_dst_mac);
return s;
}
@@ -54,24 +56,35 @@ vlib_node_registration_t sample_node;
#define foreach_sample_error \
_(SWAPPED, "Mac swap packets processed")
-typedef enum {
+typedef enum
+{
#define _(sym,str) SAMPLE_ERROR_##sym,
foreach_sample_error
#undef _
- SAMPLE_N_ERROR,
+ SAMPLE_N_ERROR,
} sample_error_t;
-static char * sample_error_strings[] = {
+static char *sample_error_strings[] = {
#define _(sym,string) string,
foreach_sample_error
#undef _
};
-typedef enum {
+typedef enum
+{
SAMPLE_NEXT_INTERFACE_OUTPUT,
SAMPLE_N_NEXT,
} sample_next_t;
+/*
+ * Simple dual/single loop version, default version which will compile
+ * everywhere.
+ *
+ * Node costs 30 clocks/pkt at a vector size of 51
+ */
+#define VERSION_1 1
+
+#ifdef VERSION_1
#define foreach_mac_address_offset \
_(0) \
_(1) \
@@ -82,10 +95,9 @@ _(5)
static uword
sample_node_fn (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame)
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- u32 n_left_from, * from, * to_next;
+ u32 n_left_from, *from, *to_next;
sample_next_t next_index;
u32 pkts_swapped = 0;
@@ -97,26 +109,25 @@ sample_node_fn (vlib_main_t * vm,
{
u32 n_left_to_next;
- vlib_get_next_frame (vm, node, next_index,
- to_next, n_left_to_next);
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
while (n_left_from >= 4 && n_left_to_next >= 2)
{
- u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
- u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
- u32 sw_if_index0, sw_if_index1;
- u8 tmp0[6], tmp1[6];
- ethernet_header_t *en0, *en1;
- u32 bi0, bi1;
- vlib_buffer_t * b0, * b1;
-
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t *b0, *b1;
+
/* Prefetch next iteration. */
{
- vlib_buffer_t * p2, * p3;
-
+ vlib_buffer_t *p2, *p3;
+
p2 = vlib_get_buffer (vm, from[2]);
p3 = vlib_get_buffer (vm, from[3]);
-
+
vlib_prefetch_buffer_header (p2, LOAD);
vlib_prefetch_buffer_header (p3, LOAD);
@@ -124,7 +135,7 @@ sample_node_fn (vlib_main_t * vm,
CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
}
- /* speculatively enqueue b0 and b1 to the current next frame */
+ /* speculatively enqueue b0 and b1 to the current next frame */
to_next[0] = bi0 = from[0];
to_next[1] = bi1 = from[1];
from += 2;
@@ -135,87 +146,85 @@ sample_node_fn (vlib_main_t * vm,
b0 = vlib_get_buffer (vm, bi0);
b1 = vlib_get_buffer (vm, bi1);
- ASSERT (b0->current_data == 0);
- ASSERT (b1->current_data == 0);
-
- en0 = vlib_buffer_get_current (b0);
- en1 = vlib_buffer_get_current (b1);
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ en1 = vlib_buffer_get_current (b1);
- /* This is not the fastest way to swap src + dst mac addresses */
+ /* This is not the fastest way to swap src + dst mac addresses */
#define _(a) tmp0[a] = en0->src_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en0->src_address[a] = en0->dst_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en0->dst_address[a] = tmp0[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) tmp1[a] = en1->src_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en1->src_address[a] = en1->dst_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en1->dst_address[a] = tmp1[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
-
-
- sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
- sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
-
- /* Send pkt back out the RX interface */
- vnet_buffer(b0)->sw_if_index[VLIB_TX] = sw_if_index0;
- vnet_buffer(b1)->sw_if_index[VLIB_TX] = sw_if_index1;
-
- pkts_swapped += 2;
-
- if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
- {
- if (b0->flags & VLIB_BUFFER_IS_TRACED)
- {
- sample_trace_t *t =
- vlib_add_trace (vm, node, b0, sizeof (*t));
- t->sw_if_index = sw_if_index0;
- t->next_index = next0;
- clib_memcpy (t->new_src_mac, en0->src_address,
- sizeof (t->new_src_mac));
- clib_memcpy (t->new_dst_mac, en0->dst_address,
- sizeof (t->new_dst_mac));
-
- }
- if (b1->flags & VLIB_BUFFER_IS_TRACED)
- {
- sample_trace_t *t =
- vlib_add_trace (vm, node, b1, sizeof (*t));
- t->sw_if_index = sw_if_index1;
- t->next_index = next1;
- clib_memcpy (t->new_src_mac, en1->src_address,
- sizeof (t->new_src_mac));
- clib_memcpy (t->new_dst_mac, en1->dst_address,
- sizeof (t->new_dst_mac));
- }
- }
-
- /* verify speculative enqueues, maybe switch current next frame */
- vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
- to_next, n_left_to_next,
- bi0, bi1, next0, next1);
- }
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
+
+ pkts_swapped += 2;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ clib_memcpy (t->new_src_mac, en1->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en1->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
while (n_left_from > 0 && n_left_to_next > 0)
{
- u32 bi0;
- vlib_buffer_t * b0;
- u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
- u32 sw_if_index0;
- u8 tmp0[6];
- ethernet_header_t *en0;
-
- /* speculatively enqueue b0 to the current next frame */
+ u32 bi0;
+ vlib_buffer_t *b0;
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0;
+ u8 tmp0[6];
+ ethernet_header_t *en0;
+
+ /* speculatively enqueue b0 to the current next frame */
bi0 = from[0];
to_next[0] = bi0;
from += 1;
@@ -224,45 +233,222 @@ sample_node_fn (vlib_main_t * vm,
n_left_to_next -= 1;
b0 = vlib_get_buffer (vm, bi0);
- /*
- * Direct from the driver, we should be at offset 0
- * aka at &b0->data[0]
- */
- ASSERT (b0->current_data == 0);
-
- en0 = vlib_buffer_get_current (b0);
-
- /* This is not the fastest way to swap src + dst mac addresses */
+ /*
+ * Direct from the driver, we should be at offset 0
+ * aka at &b0->data[0]
+ */
+ ASSERT (b0->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+
+ /* This is not the fastest way to swap src + dst mac addresses */
#define _(a) tmp0[a] = en0->src_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en0->src_address[a] = en0->dst_address[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
#define _(a) en0->dst_address[a] = tmp0[a];
- foreach_mac_address_offset;
+ foreach_mac_address_offset;
#undef _
- sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
-
- /* Send pkt back out the RX interface */
- vnet_buffer(b0)->sw_if_index[VLIB_TX] = sw_if_index0;
-
- if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
- && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
- sample_trace_t *t =
- vlib_add_trace (vm, node, b0, sizeof (*t));
- t->sw_if_index = sw_if_index0;
- t->next_index = next0;
- clib_memcpy (t->new_src_mac, en0->src_address,
- sizeof (t->new_src_mac));
- clib_memcpy (t->new_dst_mac, en0->dst_address,
- sizeof (t->new_dst_mac));
- }
-
- pkts_swapped += 1;
-
- /* verify speculative enqueue, maybe switch current next frame */
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+
+ pkts_swapped += 1;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, sample_node.index,
+ SAMPLE_ERROR_SWAPPED, pkts_swapped);
+ return frame->n_vectors;
+}
+#endif
+
+/*
+ * This version swaps mac addresses using a 128-bit (u8x16) vector shuffle.
+ * Node costs about 17 clocks/pkt at a vector size of 26
+ */
+#ifdef VERSION_2
+static uword
+sample_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ u32 n_left_from, *from, *to_next;
+ sample_next_t next_index;
+ u32 pkts_swapped = 0;
+ /* Vector shuffle mask to swap src, dst */
+ u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8x16 src_dst0, src_dst1;
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t *b0, *b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t *p2, *p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ en1 = vlib_buffer_get_current (b1);
+
+ src_dst0 = ((u8x16 *) en0)[0];
+ src_dst1 = ((u8x16 *) en1)[0];
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ src_dst1 = u8x16_shuffle (src_dst1, swapmac);
+ ((u8x16 *) en0)[0] = src_dst0;
+ ((u8x16 *) en1)[0] = src_dst1;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1;
+
+ pkts_swapped += 2;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ clib_memcpy (t->new_src_mac, en1->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en1->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t *b0;
+ u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0;
+ u8x16 src_dst0;
+ ethernet_header_t *en0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ /*
+ * Direct from the driver, we should be at offset 0
+ * aka at &b0->data[0]
+ */
+ ASSERT (b0->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ src_dst0 = ((u8x16 *) en0)[0];
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ ((u8x16 *) en0)[0] = src_dst0;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ clib_memcpy (t->new_src_mac, en0->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en0->dst_address,
+ sizeof (t->new_dst_mac));
+ }
+
+ pkts_swapped += 1;
+
+ /* verify speculative enqueue, maybe switch current next frame */
vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
to_next, n_left_to_next,
bi0, next0);
@@ -271,18 +457,166 @@ sample_node_fn (vlib_main_t * vm,
vlib_put_next_frame (vm, node, next_index, n_left_to_next);
}
- vlib_node_increment_counter (vm, sample_node.index,
- SAMPLE_ERROR_SWAPPED, pkts_swapped);
+ vlib_node_increment_counter (vm, sample_node.index,
+ SAMPLE_ERROR_SWAPPED, pkts_swapped);
return frame->n_vectors;
}
+#endif
-VLIB_REGISTER_NODE (sample_node) = {
+
+/*
+ * This version computes all of the buffer pointers in
+ * one motion, uses a quad/single loop model, and
+ * traces the frame in one pass, stopping at the first untraced buffer.
+ *
+ * Node costs about 16 clocks/pkt at a vector size of 26
+ *
+ * Some compilation drama with u8x16_shuffle, so turned off by
+ * default.
+ */
+
+#ifdef VERSION_3
+
+#define u8x16_shuffle __builtin_shuffle
+/* This would normally be a stack local, but since it's a constant... */
+static const u16 nexts[VLIB_FRAME_SIZE] = { 0 };
+
+static uword
+sample_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ u32 n_left_from, *from;
+ u32 pkts_swapped = 0;
+ /* Vector shuffle mask to swap src, dst */
+ u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
+ /* See comment below about sending all pkts to the same place... */
+ u16 *next __attribute__ ((unused));
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+ b = bufs;
+ // next = nexts;
+
+ /*
+ * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka
+ * graph arc 0. So the usual setting of next[0...3] is commented
+ * out below
+ */
+
+ while (n_left_from >= 4)
+ {
+ u8x16 src_dst0, src_dst1, src_dst2, src_dst3;
+ /* Prefetch next iteration. */
+ if (PREDICT_TRUE (n_left_from >= 8))
+ {
+ vlib_prefetch_buffer_header (b[4], STORE);
+ vlib_prefetch_buffer_header (b[5], STORE);
+ vlib_prefetch_buffer_header (b[6], STORE);
+ vlib_prefetch_buffer_header (b[7], STORE);
+ CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
+ src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0];
+ src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0];
+ src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0];
+
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ src_dst1 = u8x16_shuffle (src_dst1, swapmac);
+ src_dst2 = u8x16_shuffle (src_dst2, swapmac);
+ src_dst3 = u8x16_shuffle (src_dst3, swapmac);
+
+ ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
+ ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1;
+ ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2;
+ ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3;
+
+ vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
+ vnet_buffer (b[0])->sw_if_index[VLIB_RX];
+ vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
+ vnet_buffer (b[1])->sw_if_index[VLIB_RX];
+ vnet_buffer (b[2])->sw_if_index[VLIB_TX] =
+ vnet_buffer (b[2])->sw_if_index[VLIB_RX];
+ vnet_buffer (b[3])->sw_if_index[VLIB_TX] =
+ vnet_buffer (b[3])->sw_if_index[VLIB_RX];
+
+ // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT;
+
+ b += 4;
+ // next += 4;
+ n_left_from -= 4;
+ pkts_swapped += 4;
+ }
+
+ while (n_left_from > 0)
+ {
+ u8x16 src_dst0;
+ src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0];
+ src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+ ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0;
+ vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
+ vnet_buffer (b[0])->sw_if_index[VLIB_RX];
+ // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT;
+
+ b += 1;
+ // next += 1;
+ n_left_from -= 1;
+ pkts_swapped += 1;
+
+ }
+ vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
+ frame->n_vectors);
+
+ vlib_node_increment_counter (vm, sample_node.index,
+ SAMPLE_ERROR_SWAPPED, pkts_swapped);
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ int i;
+ b = bufs;
+
+ for (i = 0; i < frame->n_vectors; i++)
+ {
+ if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ethernet_header_t *en;
+ sample_trace_t *t =
+ vlib_add_trace (vm, node, b[0], sizeof (*t));
+ t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+ t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
+ en = vlib_buffer_get_current (b[0]);
+ clib_memcpy (t->new_src_mac, en->src_address,
+ sizeof (t->new_src_mac));
+ clib_memcpy (t->new_dst_mac, en->dst_address,
+ sizeof (t->new_dst_mac));
+ b++;
+ }
+ else
+ break;
+ }
+ }
+ return frame->n_vectors;
+}
+#endif
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (sample_node) =
+{
.function = sample_node_fn,
.name = "sample",
.vector_size = sizeof (u32),
.format_trace = format_sample_trace,
.type = VLIB_NODE_TYPE_INTERNAL,
-
+
.n_errors = ARRAY_LEN(sample_error_strings),
.error_strings = sample_error_strings,
@@ -290,6 +624,17 @@ VLIB_REGISTER_NODE (sample_node) = {
/* edit / add dispositions here */
.next_nodes = {
- [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output",
+ [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output",
},
};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (sample_node, sample_node_fn);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */