misc: add vnet/pipeline.h example

To the sample plugin. We should probably suggest that folks use the pipeline.h coding model more often. It's really easy, and these days the performance results are similar to quad-single loop coding. Type: refactor Change-Id: Ie2caa087972737e6d9c31c4ac79355f3d8ced282 Signed-off-by: Dave Barach <dave@barachs.net>
author: Dave Barach <dbarach@cisco.com> 2019-07-26 11:58:16 -0400
committer: Florin Coras <florin.coras@gmail.com> 2019-07-29 22:35:19 +0000
commit: d56550c2b669558aa38d93f44a9a3b31e0b9370f (patch)
tree: fc5c06b5b2586dbda4086bbec425d01e29984f04 /src
parent: 2b5fed8696ce2a9b67e63cf5b5dbf49505172c9a (diff)
1 files changed, 89 insertions, 1 deletions
diff --git a/src/examples/sample-plugin/sample/node.c b/src/examples/sample-plugin/sample/node.c
index 1f0a2e9775b..2c71b48b42e 100644
--- a/src/examples/sample-plugin/sample/node.c
+++ b/src/examples/sample-plugin/sample/node.c
@@ -76,8 +76,8 @@ typedef enum
  *
  * Node costs 30 clocks/pkt at a vector size of 51
  */
-#define VERSION_1 1
 
+#define VERSION_1 1
 #ifdef VERSION_1
 #define foreach_mac_address_offset              \
 _(0)                                            \
@@ -599,6 +599,94 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
 }
 #endif
 
+/*
+ * This version computes all of the buffer pointers in
+ * one motion, uses a fully pipelined loop model, and
+ * traces the entire frame in one motion.
+ *
+ * It's performance-competative with other coding paradigms,
+ * and it's the simplest way to write performant vpp code
+ */
+
+
+#ifdef VERSION_4
+
+#define u8x16_shuffle __builtin_shuffle
+
+static u8x16 swapmac =
+  { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
+
+/* Final stage in the pipeline, do the mac swap */
+static inline u32
+last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{
+  u8x16 src_dst0;
+  src_dst0 = ((u8x16 *) vlib_buffer_get_current (b))[0];
+  src_dst0 = u8x16_shuffle (src_dst0, swapmac);
+  ((u8x16 *) vlib_buffer_get_current (b))[0] = src_dst0;
+  vnet_buffer (b)->sw_if_index[VLIB_TX] =
+    vnet_buffer (b)->sw_if_index[VLIB_RX];
+  /* set next-index[] to 0 for this buffer */
+  return 0;
+}
+
+/*
+ * Add a couple of nil stages to increase the prefetch stride.
+ * For any specific platform, the optimal prefetch stride may differ.
+ */
+static inline void
+stage1 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{
+}
+
+static inline void
+stage2 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{
+}
+
+#define NSTAGES 4
+#define STAGE_INLINE inline __attribute__((__always_inline__))
+
+#define stage0 generic_stage0
+
+#include <vnet/pipeline.h>
+
+VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
+			    vlib_frame_t * frame)
+{
+  dispatch_pipeline (vm, node, frame);
+
+  vlib_node_increment_counter (vm, sample_node.index,
+			       SAMPLE_ERROR_SWAPPED, frame->n_vectors);
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    {
+      int i;
+      b = bufs;
+
+      for (i = 0; i < frame->n_vectors; i++)
+	{
+	  if (b[0]->flags & VLIB_BUFFER_IS_TRACED)
+	    {
+	      ethernet_header_t *en;
+	      sample_trace_t *t =
+		vlib_add_trace (vm, node, b[0], sizeof (*t));
+	      t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+	      t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
+	      en = vlib_buffer_get_current (b[0]);
+	      clib_memcpy_fast (t->new_src_mac, en->src_address,
+				sizeof (t->new_src_mac));
+	      clib_memcpy_fast (t->new_dst_mac, en->dst_address,
+				sizeof (t->new_dst_mac));
+	      b++;
+	    }
+	  else
+	    break;
+	}
+    }
+  return frame->n_vectors;
+}
+#endif
+
 /* *INDENT-OFF* */
 VLIB_REGISTER_NODE (sample_node) =
 {
author	Dave Barach <dbarach@cisco.com>	2019-07-26 11:58:16 -0400
committer	Florin Coras <florin.coras@gmail.com>	2019-07-29 22:35:19 +0000
commit	d56550c2b669558aa38d93f44a9a3b31e0b9370f (patch)
tree	fc5c06b5b2586dbda4086bbec425d01e29984f04 /src
parent	2b5fed8696ce2a9b67e63cf5b5dbf49505172c9a (diff)