about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Damjan Marion <damarion@cisco.com> 2021-04-21 15:25:47 +0200
committer: Florin Coras <florin.coras@gmail.com> 2021-04-21 18:30:52 +0000
commit: 542088597886df774e63f841166721deeffef1c1 (patch)
tree: 719589064b0a1d090398bfe39cbe5b448a96a5bf /src
parent: 1c2297144a37ea6a42f5deec6e1613a297de3321 (diff)
buffers: vlib_get_buffers() with 512-bit SIMD
Type: improvement
Change-Id: Id8ce3ffc1299a38171b82a7082454412c840a40c
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src')
-rw-r--r-- src/vlib/buffer_funcs.h | 51
-rw-r--r-- src/vppinfra/vector_avx512.h | 1
2 files changed, 47 insertions, 5 deletions
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index e4427d6c382..8ab9c4e53ad 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -235,20 +235,61 @@ vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count,
i32 offset)
{
uword buffer_mem_start = vm->buffer_main->buffer_mem_start;
-#ifdef CLIB_HAVE_VEC256
- u64x4 off = u64x4_splat (buffer_mem_start + offset);
+#ifdef CLIB_HAVE_VEC512
+ u64x8 of8 = u64x8_splat (buffer_mem_start + offset);
+ u64x4 off = u64x8_extract_lo (of8);
/* if count is not const, compiler will not unroll while loop
se we maintain two-in-parallel variant */
+ while (count >= 32)
+ {
+ u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
+ u64x8 b1 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 8));
+ u64x8 b2 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 16));
+ u64x8 b3 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 24));
+ /* shift and add to get vlib_buffer_t pointer */
+ u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
+ u64x8_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 8);
+ u64x8_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 16);
+ u64x8_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 24);
+ b += 32;
+ bi += 32;
+ count -= 32;
+ }
while (count >= 8)
{
+ u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
+ /* shift and add to get vlib_buffer_t pointer */
+ u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
+ b += 8;
+ bi += 8;
+ count -= 8;
+ }
+#elif defined CLIB_HAVE_VEC256
+ u64x4 off = u64x4_splat (buffer_mem_start + offset);
+ /* if count is not const, compiler will not unroll while loop
+ so we maintain an eight-in-parallel variant */
+ while (count >= 32)
+ {
u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4));
+ u64x4 b2 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 8));
+ u64x4 b3 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 12));
+ u64x4 b4 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 16));
+ u64x4 b5 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 20));
+ u64x4 b6 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 24));
+ u64x4 b7 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 28));
/* shift and add to get vlib_buffer_t pointer */
u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
- b += 8;
- bi += 8;
- count -= 8;
+ u64x4_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 8);
+ u64x4_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 12);
+ u64x4_store_unaligned ((b4 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 16);
+ u64x4_store_unaligned ((b5 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 20);
+ u64x4_store_unaligned ((b6 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 24);
+ u64x4_store_unaligned ((b7 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 28);
+ b += 32;
+ bi += 32;
+ count -= 32;
}
#endif
while (count >= 4)
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index 3406a7ddb91..87999eee947 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -264,6 +264,7 @@ _ (u64x8, u8, epu64, _mm512, __m512i)
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
+_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _
#define _(vt, mt, bits, epi) \