diff options
-rw-r--r-- | src/examples/sample-plugin/sample/node.c | 35 | ||||
-rw-r--r-- | src/vppinfra/vector.h | 62 | ||||
-rw-r--r-- | src/vppinfra/vector_avx2.h | 6 | ||||
-rw-r--r-- | src/vppinfra/vector_avx512.h | 6 | ||||
-rw-r--r-- | src/vppinfra/vector_neon.h | 6 | ||||
-rw-r--r-- | src/vppinfra/vector_sse42.h | 18 |
6 files changed, 80 insertions, 53 deletions
diff --git a/src/examples/sample-plugin/sample/node.c b/src/examples/sample-plugin/sample/node.c index a31c3e86e08..a9d8b66d788 100644 --- a/src/examples/sample-plugin/sample/node.c +++ b/src/examples/sample-plugin/sample/node.c @@ -291,7 +291,6 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, sample_next_t next_index; u32 pkts_swapped = 0; /* Vector shuffle mask to swap src, dst */ - u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -345,8 +344,10 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, src_dst0 = ((u8x16 *) en0)[0]; src_dst1 = ((u8x16 *) en1)[0]; - src_dst0 = u8x16_shuffle (src_dst0, swapmac); - src_dst1 = u8x16_shuffle (src_dst1, swapmac); + src_dst0 = u8x16_shuffle (src_dst0, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, + 4, 5, 12, 13, 14, 15); + src_dst1 = u8x16_shuffle (src_dst1, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, + 4, 5, 12, 13, 14, 15); ((u8x16 *) en0)[0] = src_dst0; ((u8x16 *) en1)[0] = src_dst1; @@ -418,7 +419,8 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, en0 = vlib_buffer_get_current (b0); src_dst0 = ((u8x16 *) en0)[0]; - src_dst0 = u8x16_shuffle (src_dst0, swapmac); + src_dst0 = u8x16_shuffle (src_dst0, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, + 4, 5, 12, 13, 14, 15); ((u8x16 *) en0)[0] = src_dst0; sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; @@ -469,7 +471,6 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, #ifdef VERSION_3 -#define u8x16_shuffle __builtin_shuffle /* This would normally be a stack local, but since it's a constant... */ static const u16 nexts[VLIB_FRAME_SIZE] = { 0 }; @@ -479,7 +480,6 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_from, *from; u32 pkts_swapped = 0; /* Vector shuffle mask to swap src, dst */ - u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; /* See comment below about sending all pkts to the same place... */ u16 *next __attribute__ ((unused)); @@ -518,10 +518,14 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0]; src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0]; - src_dst0 = u8x16_shuffle (src_dst0, swapmac); - src_dst1 = u8x16_shuffle (src_dst1, swapmac); - src_dst2 = u8x16_shuffle (src_dst2, swapmac); - src_dst3 = u8x16_shuffle (src_dst3, swapmac); + src_dst0 = u8x16_shuffle (src_dst0, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, + 12, 13, 14, 15); + src_dst1 = u8x16_shuffle (src_dst1, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, + 12, 13, 14, 15); + src_dst2 = u8x16_shuffle (src_dst2, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, + 12, 13, 14, 15); + src_dst3 = u8x16_shuffle (src_dst3, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, + 12, 13, 14, 15); ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1; @@ -552,7 +556,8 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, { u8x16 src_dst0; src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0]; - src_dst0 = u8x16_shuffle (src_dst0, swapmac); + src_dst0 = u8x16_shuffle (src_dst0, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, + 12, 13, 14, 15); ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; vnet_buffer (b[0])->sw_if_index[VLIB_TX] = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; @@ -611,18 +616,14 @@ VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, #ifdef VERSION_4 -#define u8x16_shuffle __builtin_shuffle - -static u8x16 swapmac = - { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; - /* Final stage in the pipeline, do the mac swap */ static inline u32 last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b) { u8x16 src_dst0; src_dst0 = ((u8x16 *) vlib_buffer_get_current (b))[0]; - src_dst0 = u8x16_shuffle (src_dst0, swapmac); + src_dst0 = u8x16_shuffle (src_dst0, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, + 13, 14, 15); ((u8x16 *) vlib_buffer_get_current (b))[0] = src_dst0; vnet_buffer (b)->sw_if_index[VLIB_TX] = vnet_buffer (b)->sw_if_index[VLIB_RX]; diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h index 88cf288cb26..49bc2976b3e 100644 --- a/src/vppinfra/vector.h +++ b/src/vppinfra/vector.h @@ -126,6 +126,68 @@ foreach_vec #undef _vector_size + /* _shuffle and _shuffle2 */ +#if defined(__GNUC__) && !defined(__clang__) +#define __builtin_shufflevector(v1, v2, ...) \ + __builtin_shuffle ((v1), (v2), (__typeof__ (v1)){ __VA_ARGS__ }) +#endif + +#define u8x16_shuffle(v1, ...) \ + (u8x16) __builtin_shufflevector ((u8x16) (v1), (u8x16) (v1), __VA_ARGS__) +#define u8x32_shuffle(v1, ...) \ + (u8x32) __builtin_shufflevector ((u8x32) (v1), (u8x32) (v1), __VA_ARGS__) +#define u8x64_shuffle(v1, ...) \ + (u8x64) __builtin_shufflevector ((u8x64) (v1), (u8x64) (v1), __VA_ARGS__) + +#define u16x8_shuffle(v1, ...) \ + (u16x8) __builtin_shufflevector ((u16x8) (v1), (u16x8) (v1), __VA_ARGS__) +#define u16x16_shuffle(v1, ...) \ + (u16x16) __builtin_shufflevector ((u16x16) (v1), (u16x16) (v1), __VA_ARGS__) +#define u16x32_shuffle(v1, ...) \ + (u16u32) __builtin_shufflevector ((u16x32) (v1), (u16x32) (v1), __VA_ARGS__); + +#define u32x4_shuffle(v1, ...) \ + (u32x4) __builtin_shufflevector ((u32x4) (v1), (u32x4) (v1), __VA_ARGS__) +#define u32x8_shuffle(v1, ...) \ + (u32x8) __builtin_shufflevector ((u32x8) (v1), (u32x8) (v1), __VA_ARGS__) +#define u32x16_shuffle(v1, ...) \ + (u32x16) __builtin_shufflevector ((u32x16) (v1), (u32x16) (v1), __VA_ARGS__) + +#define u64x2_shuffle(v1, ...) \ + (u64x2) __builtin_shufflevector ((u64x2) (v1), (u64x2) (v1), __VA_ARGS__) +#define u64x4_shuffle(v1, ...) \ + (u64x4) __builtin_shufflevector ((u64x4) (v1), (u64x4) (v1), __VA_ARGS__) +#define u64x8_shuffle(v1, ...) \ + (u64x8) __builtin_shufflevector ((u64x8) (v1), (u64x8) (v1), __VA_ARGS__) + +#define u8x16_shuffle2(v1, v2, ...) \ + (u8x16) __builtin_shufflevector ((u8x16) (v1), (u8x16) (v2), __VA_ARGS__) +#define u8x32_shuffle2(v1, v2, ...) \ + (u8x32) __builtin_shufflevector ((u8x32) (v1), (u8x32) (v2), __VA_ARGS__) +#define u8x64_shuffle2(v1, v2, ...) \ + (u8x64) __builtin_shufflevector ((u8x64) (v1), (u8x64) (v2), __VA_ARGS__) + +#define u16x8_shuffle2(v1, v2, ...) \ + (u16x8) __builtin_shufflevector ((u16x8) (v1), (u16x8) (v2), __VA_ARGS__) +#define u16x16_shuffle2(v1, v2, ...) \ + (u16x16) __builtin_shufflevector ((u16x16) (v1), (u16x16) (v2), __VA_ARGS__) +#define u16x32_shuffle2(v1, v2, ...) \ + (u16u32) __builtin_shufflevector ((u16x32) (v1), (u16x32) (v2), __VA_ARGS__); + +#define u32x4_shuffle2(v1, v2, ...) \ + (u32x4) __builtin_shufflevector ((u32x4) (v1), (u32x4) (v2), __VA_ARGS__) +#define u32x8_shuffle2(v1, v2, ...) \ + (u32x8) __builtin_shufflevector ((u32x8) (v1), (u32x8) (v2), __VA_ARGS__) +#define u32x16_shuffle2(v1, v2, ...) \ + (u32x16) __builtin_shufflevector ((u32x16) (v1), (u32x16) (v2), __VA_ARGS__) + +#define u64x2_shuffle2(v1, v2, ...) \ + (u64x2) __builtin_shufflevector ((u64x2) (v1), (u64x2) (v2), __VA_ARGS__) +#define u64x4_shuffle2(v1, v2, ...) \ + (u64x4) __builtin_shufflevector ((u64x4) (v1), (u64x4) (v2), __VA_ARGS__) +#define u64x8_shuffle2(v1, v2, ...) \ + (u64x8) __builtin_shufflevector ((u64x8) (v1), (u64x8) (v2), __VA_ARGS__) + #define VECTOR_WORD_TYPE(t) t##x #define VECTOR_WORD_TYPE_LEN(t) (sizeof (VECTOR_WORD_TYPE(t)) / sizeof (t)) diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h index 7226c230e68..59857182a93 100644 --- a/src/vppinfra/vector_avx2.h +++ b/src/vppinfra/vector_avx2.h @@ -183,12 +183,6 @@ u16x16_byte_swap (u16x16 v) return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap); } -static_always_inline u8x32 -u8x32_shuffle (u8x32 v, u8x32 m) -{ - return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m); -} - #define u8x32_align_right(a, b, imm) \ (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm) diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h index 8acac2a3a9f..33f40ef7b5a 100644 --- a/src/vppinfra/vector_avx512.h +++ b/src/vppinfra/vector_avx512.h @@ -196,12 +196,6 @@ u8x64_reflect_u8x16 (u8x64 x) return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask); } -static_always_inline u8x64 -u8x64_shuffle (u8x64 v, u8x64 m) -{ - return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m); -} - #define u8x64_align_right(a, b, imm) \ (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm) diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h index 80d7bda9f3c..e7b31259f73 100644 --- a/src/vppinfra/vector_neon.h +++ b/src/vppinfra/vector_neon.h @@ -129,12 +129,6 @@ u32x4_byte_swap (u32x4 v) return (u32x4) vrev32q_u8 ((u8x16) v); } -static_always_inline u8x16 -u8x16_shuffle (u8x16 v, u8x16 m) -{ - return (u8x16) vqtbl1q_u8 (v, m); -} - static_always_inline u32x4 u32x4_hadd (u32x4 v1, u32x4 v2) { diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h index 7e75ad28710..35495d6658f 100644 --- a/src/vppinfra/vector_sse42.h +++ b/src/vppinfra/vector_sse42.h @@ -411,24 +411,6 @@ u32x4_sum_elts (u32x4 sum4) return sum4[0]; } -static_always_inline u8x16 -u8x16_shuffle (u8x16 v, u8x16 m) -{ - return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m); -} - -static_always_inline u32x4 -u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d) -{ -#if defined(__clang__) || !__OPTIMIZE__ - u32x4 r = { v[a], v[b], v[c], v[d] }; - return r; -#else - return (u32x4) _mm_shuffle_epi32 ((__m128i) v, - a | b << 2 | c << 4 | d << 6); -#endif -} - /* _from_ */ /* *INDENT-OFF* */ #define _(f,t,i) \ |