summaryrefslogtreecommitdiffstats
path: root/src/vppinfra
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2021-04-21 21:11:35 +0200
committerBenoît Ganne <bganne@cisco.com>2021-04-26 13:47:40 +0000
commit856d062ce67f93d83f1ff302a394e8a4448ad017 (patch)
tree7b7dc8436850bcf03ab74d91de65eb8fee59e6d2 /src/vppinfra
parentc99ab12cbb9252806db55f7a79e5d84b41df3525 (diff)
vppinfra: clib_memcpy_u32() utilizing SIMD mask loads/stores
Type: improvement Change-Id: I55c4688bd1feffae139ce12a66d15885373e5cd7 Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra')
-rw-r--r--src/vppinfra/CMakeLists.txt1
-rw-r--r--src/vppinfra/memcpy.h155
-rw-r--r--src/vppinfra/string.h2
3 files changed, 158 insertions, 0 deletions
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 8cebd32ffe3..11193a37482 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -144,6 +144,7 @@ set(VPPINFRA_HEADERS
macros.h
maplog.h
math.h
+ memcpy.h
memcpy_avx2.h
memcpy_avx512.h
memcpy_sse3.h
diff --git a/src/vppinfra/memcpy.h b/src/vppinfra/memcpy.h
new file mode 100644
index 00000000000..115379797d2
--- /dev/null
+++ b/src/vppinfra/memcpy.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/clib.h>
+#ifndef included_memcpy_h
+#define included_memcpy_h
+
+/* Copy exactly four u32 elements from src to dst.
+ *
+ * When CLIB_HAVE_VEC128 is defined the copy is one u32x4 unaligned load
+ * whose result is handed to one u32x4 unaligned store; otherwise it is
+ * delegated to clib_memcpy_fast () with a length of 4 * sizeof (u32). */
+static_always_inline void
+clib_memcpy_u32_x4 (u32 *dst, u32 *src)
+{
+#ifndef CLIB_HAVE_VEC128
+  clib_memcpy_fast (dst, src, 4 * sizeof (u32));
+#else
+  u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
+#endif
+}
+
+/* Copy exactly eight u32 elements from src to dst.
+ *
+ * When CLIB_HAVE_VEC256 is defined the copy is one u32x8 unaligned
+ * load/store pair; otherwise it is done as two consecutive 4-element
+ * copies through clib_memcpy_u32_x4 (). */
+static_always_inline void
+clib_memcpy_u32_x8 (u32 *dst, u32 *src)
+{
+#ifndef CLIB_HAVE_VEC256
+  /* CLIB_HAVE_VEC256 not defined: lower half first, then upper half */
+  clib_memcpy_u32_x4 (&dst[0], &src[0]);
+  clib_memcpy_u32_x4 (&dst[4], &src[4]);
+#else
+  u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
+#endif
+}
+
+/* Copy exactly sixteen u32 elements from src to dst.
+ *
+ * When CLIB_HAVE_VEC512 is defined the copy is one u32x16 unaligned
+ * load/store pair; otherwise it is done as two consecutive 8-element
+ * copies through clib_memcpy_u32_x8 (). */
+static_always_inline void
+clib_memcpy_u32_x16 (u32 *dst, u32 *src)
+{
+#ifndef CLIB_HAVE_VEC512
+  /* CLIB_HAVE_VEC512 not defined: lower half first, then upper half */
+  clib_memcpy_u32_x8 (&dst[0], &src[0]);
+  clib_memcpy_u32_x8 (&dst[8], &src[8]);
+#else
+  u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
+#endif
+}
+
+/* Copy n_left u32 elements from src to dst.
+ *
+ * Without CLIB_HAVE_VEC128 only the element-by-element loop at the bottom
+ * is compiled.  With it:
+ *  - a compile-time-constant n_left of 4, 8, 12, 16, 32 or 64 is handed
+ *    directly to the fixed-size helpers and the function returns;
+ *  - any other count is consumed in chunks of 64 (CLIB_HAVE_VEC512 only),
+ *    32 (CLIB_HAVE_VEC256 only) and 16 elements;
+ *  - the remainder is finished by one masked load/store when the matching
+ *    *_MASK_LOAD_STORE macro is defined, otherwise by an 8-element copy,
+ *    a 4-element copy and finally the scalar loop. */
+static_always_inline void
+clib_memcpy_u32 (u32 *dst, u32 *src, u32 n_left)
+{
+#ifdef CLIB_HAVE_VEC128
+  if (COMPILE_TIME_CONST (n_left))
+    {
+      /* When the count is known at compile time, steer the common sizes to
+       * plain register-sized load/store so the compiler does not pick the
+       * more expensive masked load/store for them.  Counts not listed here
+       * continue into the generic path below. */
+      if (n_left == 4)
+        {
+          clib_memcpy_u32_x4 (dst, src);
+          return;
+        }
+      if (n_left == 8)
+        {
+          clib_memcpy_u32_x8 (dst, src);
+          return;
+        }
+      if (n_left == 12)
+        {
+          clib_memcpy_u32_x8 (dst, src);
+          clib_memcpy_u32_x4 (&dst[8], &src[8]);
+          return;
+        }
+      if (n_left == 16)
+        {
+          clib_memcpy_u32_x16 (dst, src);
+          return;
+        }
+      if (n_left == 32)
+        {
+          clib_memcpy_u32_x16 (dst, src);
+          clib_memcpy_u32_x16 (&dst[16], &src[16]);
+          return;
+        }
+      if (n_left == 64)
+        {
+          clib_memcpy_u32_x16 (dst, src);
+          clib_memcpy_u32_x16 (&dst[16], &src[16]);
+          clib_memcpy_u32_x16 (&dst[32], &src[32]);
+          clib_memcpy_u32_x16 (&dst[48], &src[48]);
+          return;
+        }
+    }
+
+#ifdef CLIB_HAVE_VEC512
+  /* 64 elements per iteration */
+  for (; n_left >= 64; n_left -= 64, dst += 64, src += 64)
+    {
+      clib_memcpy_u32_x16 (&dst[0], &src[0]);
+      clib_memcpy_u32_x16 (&dst[16], &src[16]);
+      clib_memcpy_u32_x16 (&dst[32], &src[32]);
+      clib_memcpy_u32_x16 (&dst[48], &src[48]);
+    }
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+  /* 32 elements per iteration */
+  for (; n_left >= 32; n_left -= 32, dst += 32, src += 32)
+    {
+      clib_memcpy_u32_x16 (&dst[0], &src[0]);
+      clib_memcpy_u32_x16 (&dst[16], &src[16]);
+    }
+#endif
+
+  /* 16 elements per iteration; fewer than 16 remain afterwards */
+  for (; n_left >= 16; n_left -= 16, dst += 16, src += 16)
+    clib_memcpy_u32_x16 (dst, src);
+
+#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE
+  /* Finish the remaining (< 16) elements with one masked load/store.
+   * NOTE(review): pow2_mask () is defined elsewhere; presumably it yields a
+   * mask with the low n_left bits set - confirm. */
+  if (n_left != 0)
+    {
+      u16 tail_mask = pow2_mask (n_left);
+      u32x16_mask_store (u32x16_mask_load_zero (src, tail_mask), dst,
+                         tail_mask);
+    }
+  return;
+#endif
+
+  /* at most one 8-element copy is needed since n_left < 16 here */
+  if (n_left >= 8)
+    {
+      clib_memcpy_u32_x8 (dst, src);
+      n_left -= 8;
+      dst += 8;
+      src += 8;
+    }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  /* Finish the remaining (< 8) elements with one masked load/store. */
+  if (n_left != 0)
+    {
+      u8 tail_mask = pow2_mask (n_left);
+      u32x8_mask_store (u32x8_mask_load_zero (src, tail_mask), dst,
+                        tail_mask);
+    }
+  return;
+#endif
+
+  /* at most one 4-element copy is needed since n_left < 8 here */
+  if (n_left >= 4)
+    {
+      clib_memcpy_u32_x4 (dst, src);
+      n_left -= 4;
+      dst += 4;
+      src += 4;
+    }
+#endif
+
+  /* scalar copy of whatever is left (everything, when CLIB_HAVE_VEC128 is
+   * not defined) */
+  for (; n_left > 0; n_left--)
+    *dst++ = *src++;
+}
+
+#endif
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 76d345e16bc..5d01e48eb1c 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -99,6 +99,8 @@ clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
#undef clib_memcpy_fast_arch
+#include <vppinfra/memcpy.h>
+
/* c-11 string manipulation variants */
#ifndef EOK