aboutsummaryrefslogtreecommitdiffstats
path: root/src/vppinfra/memcpy.h
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2021-04-21 21:11:35 +0200
committerBenoît Ganne <bganne@cisco.com>2021-04-26 13:47:40 +0000
commit856d062ce67f93d83f1ff302a394e8a4448ad017 (patch)
tree7b7dc8436850bcf03ab74d91de65eb8fee59e6d2 /src/vppinfra/memcpy.h
parentc99ab12cbb9252806db55f7a79e5d84b41df3525 (diff)
vppinfra: clib_memcpy_u32() utilizing SIMD mask loads/stores
Type: improvement Change-Id: I55c4688bd1feffae139ce12a66d15885373e5cd7 Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra/memcpy.h')
-rw-r--r--src/vppinfra/memcpy.h155
1 files changed, 155 insertions, 0 deletions
diff --git a/src/vppinfra/memcpy.h b/src/vppinfra/memcpy.h
new file mode 100644
index 00000000000..115379797d2
--- /dev/null
+++ b/src/vppinfra/memcpy.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/clib.h>
+#ifndef included_memcpy_h
+#define included_memcpy_h
+
+/* Copy exactly 4 u32 elements from src to dst.
+ * With CLIB_HAVE_VEC128 this is one u32x4_load_unaligned () followed by one
+ * u32x4_store_unaligned (); otherwise the copy is delegated to
+ * clib_memcpy_fast (). */
+static_always_inline void
+clib_memcpy_u32_x4 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC128)
+  u32x4 v = u32x4_load_unaligned (src);
+  u32x4_store_unaligned (v, dst);
+#else
+  clib_memcpy_fast (dst, src, sizeof (u32) * 4);
+#endif
+}
+/* Copy exactly 8 u32 elements from src to dst.
+ * With CLIB_HAVE_VEC256 this is one u32x8_load_unaligned () followed by one
+ * u32x8_store_unaligned (); otherwise the copy is done as two consecutive
+ * 4-element copies. */
+static_always_inline void
+clib_memcpy_u32_x8 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC256)
+  u32x8 v = u32x8_load_unaligned (src);
+  u32x8_store_unaligned (v, dst);
+#else
+  /* lower half first, then upper half */
+  clib_memcpy_u32_x4 (&dst[0], &src[0]);
+  clib_memcpy_u32_x4 (&dst[4], &src[4]);
+#endif
+}
+
+/* Copy exactly 16 u32 elements from src to dst.
+ * With CLIB_HAVE_VEC512 this is one u32x16_load_unaligned () followed by one
+ * u32x16_store_unaligned (); otherwise the copy is done as two consecutive
+ * 8-element copies. */
+static_always_inline void
+clib_memcpy_u32_x16 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC512)
+  u32x16 v = u32x16_load_unaligned (src);
+  u32x16_store_unaligned (v, dst);
+#else
+  /* lower half first, then upper half */
+  clib_memcpy_u32_x8 (&dst[0], &src[0]);
+  clib_memcpy_u32_x8 (&dst[8], &src[8]);
+#endif
+}
+
+/* Copy n_left u32 elements from src to dst.
+ *
+ * When CLIB_HAVE_VEC128 is defined:
+ *  - if n_left is a compile-time constant equal to 4, 8, 12, 16, 32 or 64,
+ *    the copy is dispatched directly to the fixed-size helpers;
+ *  - otherwise the data is copied in chunks of 64 (VEC512 only), 32 (VEC256
+ *    only) and 16 elements, and the remainder (fewer than 16 elements) is
+ *    finished either with a single masked load/store, when the 512-bit or
+ *    256-bit mask variants are available, or with 8- and 4-element copies.
+ *
+ * Whatever is still left, and everything when CLIB_HAVE_VEC128 is not
+ * defined, is copied one element at a time. */
+static_always_inline void
+clib_memcpy_u32 (u32 *dst, u32 *src, u32 n_left)
+{
+#if defined(CLIB_HAVE_VEC128)
+  if (COMPILE_TIME_CONST (n_left))
+    {
+      /* When n_left is a compile-time constant, keep the compiler from
+       * using the more expensive mask load/store for the common sizes that
+       * plain register-sized loads/stores cover exactly. */
+      switch (n_left)
+        {
+        case 4:
+          clib_memcpy_u32_x4 (dst, src);
+          return;
+        case 8:
+          clib_memcpy_u32_x8 (dst, src);
+          return;
+        case 12:
+          clib_memcpy_u32_x8 (&dst[0], &src[0]);
+          clib_memcpy_u32_x4 (&dst[8], &src[8]);
+          return;
+        case 16:
+          clib_memcpy_u32_x16 (dst, src);
+          return;
+        case 32:
+          clib_memcpy_u32_x16 (&dst[0], &src[0]);
+          clib_memcpy_u32_x16 (&dst[16], &src[16]);
+          return;
+        case 64:
+          clib_memcpy_u32_x16 (&dst[0], &src[0]);
+          clib_memcpy_u32_x16 (&dst[16], &src[16]);
+          clib_memcpy_u32_x16 (&dst[32], &src[32]);
+          clib_memcpy_u32_x16 (&dst[48], &src[48]);
+          return;
+        default:
+          /* any other constant takes the generic path below */
+          break;
+        }
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  /* 64 elements per iteration */
+  for (; n_left >= 64; dst += 64, src += 64, n_left -= 64)
+    {
+      clib_memcpy_u32_x16 (&dst[0], &src[0]);
+      clib_memcpy_u32_x16 (&dst[16], &src[16]);
+      clib_memcpy_u32_x16 (&dst[32], &src[32]);
+      clib_memcpy_u32_x16 (&dst[48], &src[48]);
+    }
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+  /* 32 elements per iteration */
+  for (; n_left >= 32; dst += 32, src += 32, n_left -= 32)
+    {
+      clib_memcpy_u32_x16 (&dst[0], &src[0]);
+      clib_memcpy_u32_x16 (&dst[16], &src[16]);
+    }
+#endif
+
+  /* 16 elements per iteration */
+  for (; n_left >= 16; dst += 16, src += 16, n_left -= 16)
+    clib_memcpy_u32_x16 (dst, src);
+
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+  /* fewer than 16 elements remain: finish them with one masked 16-lane
+   * load/store and skip the smaller steps below */
+  if (n_left)
+    {
+      u16 lanes = pow2_mask (n_left);
+      u32x16 v = u32x16_mask_load_zero (src, lanes);
+      u32x16_mask_store (v, dst, lanes);
+    }
+  return;
+#endif
+
+  if (n_left >= 8)
+    {
+      clib_memcpy_u32_x8 (dst, src);
+      n_left -= 8;
+      dst += 8;
+      src += 8;
+    }
+
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+  /* fewer than 8 elements remain: finish them with one masked 8-lane
+   * load/store and skip the smaller steps below */
+  if (n_left)
+    {
+      u8 lanes = pow2_mask (n_left);
+      u32x8 v = u32x8_mask_load_zero (src, lanes);
+      u32x8_mask_store (v, dst, lanes);
+    }
+  return;
+#endif
+
+  if (n_left >= 4)
+    {
+      clib_memcpy_u32_x4 (dst, src);
+      n_left -= 4;
+      dst += 4;
+      src += 4;
+    }
+#endif
+
+  /* scalar tail; this is the whole copy when no vector support is present */
+  for (; n_left; n_left--)
+    *dst++ = *src++;
+}
+
+#endif