summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/vlib/buffer_funcs.h 38
-rw-r--r-- src/vppinfra/CMakeLists.txt 1
-rw-r--r-- src/vppinfra/memcpy.h 155
-rw-r--r-- src/vppinfra/string.h 2
4 files changed, 159 insertions, 37 deletions
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 8ab9c4e53ad..7829986d643 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -121,43 +121,7 @@ vlib_buffer_get_default_data_size (vlib_main_t * vm)
static_always_inline void
vlib_buffer_copy_indices (u32 * dst, u32 * src, u32 n_indices)
{
-#if defined(CLIB_HAVE_VEC512)
- while (n_indices >= 16)
- {
- u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
- dst += 16;
- src += 16;
- n_indices -= 16;
- }
-#endif
-
-#if defined(CLIB_HAVE_VEC256)
- while (n_indices >= 8)
- {
- u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
- dst += 8;
- src += 8;
- n_indices -= 8;
- }
-#endif
-
-#if defined(CLIB_HAVE_VEC128)
- while (n_indices >= 4)
- {
- u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
- dst += 4;
- src += 4;
- n_indices -= 4;
- }
-#endif
-
- while (n_indices)
- {
- dst[0] = src[0];
- dst += 1;
- src += 1;
- n_indices -= 1;
- }
+ clib_memcpy_u32 (dst, src, n_indices); /* copies n_indices u32 values from src to dst (vppinfra/memcpy.h) */
}
always_inline void
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 8cebd32ffe3..11193a37482 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -144,6 +144,7 @@ set(VPPINFRA_HEADERS
macros.h
maplog.h
math.h
+ memcpy.h
memcpy_avx2.h
memcpy_avx512.h
memcpy_sse3.h
diff --git a/src/vppinfra/memcpy.h b/src/vppinfra/memcpy.h
new file mode 100644
index 00000000000..115379797d2
--- /dev/null
+++ b/src/vppinfra/memcpy.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/clib.h>
+#ifndef included_memcpy_h
+#define included_memcpy_h
+/* Copy 4 u32 elements from src to dst. */
+static_always_inline void
+clib_memcpy_u32_x4 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC128)
+ u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
+#else /* CLIB_HAVE_VEC128 not defined: copy 4 * sizeof (u32) bytes generically */
+ clib_memcpy_fast (dst, src, 4 * sizeof (u32));
+#endif /* CLIB_HAVE_VEC128 */
+}
+static_always_inline void
+clib_memcpy_u32_x8 (u32 *dst, u32 *src) /* copy 8 u32 elements from src to dst */
+{
+#if defined(CLIB_HAVE_VEC256)
+ u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
+#else /* CLIB_HAVE_VEC256 not defined: copy as two 4-element halves */
+ clib_memcpy_u32_x4 (dst, src);
+ clib_memcpy_u32_x4 (dst + 4, src + 4);
+#endif /* CLIB_HAVE_VEC256 */
+}
+/* Copy 16 u32 elements from src to dst. */
+static_always_inline void
+clib_memcpy_u32_x16 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC512)
+ u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
+#else /* CLIB_HAVE_VEC512 not defined: copy as two 8-element halves */
+ clib_memcpy_u32_x8 (dst, src);
+ clib_memcpy_u32_x8 (dst + 8, src + 8);
+#endif /* CLIB_HAVE_VEC512 */
+}
+/* Copy n_left u32 elements from src to dst, widest available copy first. */
+static_always_inline void
+clib_memcpy_u32 (u32 *dst, u32 *src, u32 n_left)
+{
+#if defined(CLIB_HAVE_VEC128)
+ if (COMPILE_TIME_CONST (n_left))
+ {
+ /* when n_left is a compile-time constant, keep the compiler from
+ * emitting the more expensive mask load/store for common sizes that
+ * plain (narrower) register load/store sequences can cover */
+ switch (n_left)
+ {
+ case 4:
+ clib_memcpy_u32_x4 (dst, src);
+ return;
+ case 8:
+ clib_memcpy_u32_x8 (dst, src);
+ return;
+ case 12:
+ clib_memcpy_u32_x8 (dst, src);
+ clib_memcpy_u32_x4 (dst + 8, src + 8);
+ return;
+ case 16:
+ clib_memcpy_u32_x16 (dst, src);
+ return;
+ case 32:
+ clib_memcpy_u32_x16 (dst, src);
+ clib_memcpy_u32_x16 (dst + 16, src + 16);
+ return;
+ case 64:
+ clib_memcpy_u32_x16 (dst, src);
+ clib_memcpy_u32_x16 (dst + 16, src + 16);
+ clib_memcpy_u32_x16 (dst + 32, src + 32);
+ clib_memcpy_u32_x16 (dst + 48, src + 48);
+ return;
+ default: /* any other constant count uses the generic path below */
+ break;
+ }
+ }
+
+#if defined(CLIB_HAVE_VEC512)
+ while (n_left >= 64) /* 64 elements per iteration */
+ {
+ clib_memcpy_u32_x16 (dst, src);
+ clib_memcpy_u32_x16 (dst + 16, src + 16);
+ clib_memcpy_u32_x16 (dst + 32, src + 32);
+ clib_memcpy_u32_x16 (dst + 48, src + 48);
+ dst += 64;
+ src += 64;
+ n_left -= 64;
+ }
+#endif /* CLIB_HAVE_VEC512 */
+
+#if defined(CLIB_HAVE_VEC256)
+ while (n_left >= 32) /* 32 elements per iteration */
+ {
+ clib_memcpy_u32_x16 (dst, src);
+ clib_memcpy_u32_x16 (dst + 16, src + 16);
+ dst += 32;
+ src += 32;
+ n_left -= 32;
+ }
+#endif /* CLIB_HAVE_VEC256 */
+
+ while (n_left >= 16) /* after this loop fewer than 16 elements remain */
+ {
+ clib_memcpy_u32_x16 (dst, src);
+ dst += 16;
+ src += 16;
+ n_left -= 16;
+ }
+
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ if (n_left)
+ {
+ u16 mask = pow2_mask (n_left); /* presumably the low n_left bits set - confirm against pow2_mask */
+ u32x16_mask_store (u32x16_mask_load_zero (src, mask), dst, mask);
+ }
+ return; /* tail handled by the masked copy; code below is not reached in this build */
+#endif /* CLIB_HAVE_VEC512_MASK_LOAD_STORE */
+
+ if (n_left >= 8) /* at most one 8-element step is needed here */
+ {
+ clib_memcpy_u32_x8 (dst, src);
+ dst += 8;
+ src += 8;
+ n_left -= 8;
+ }
+
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ if (n_left)
+ {
+ u8 mask = pow2_mask (n_left); /* fewer than 8 elements remain, so the mask fits in u8 if pow2_mask sets n_left low bits - confirm */
+ u32x8_mask_store (u32x8_mask_load_zero (src, mask), dst, mask);
+ }
+ return; /* tail handled by the masked copy; code below is not reached in this build */
+#endif /* CLIB_HAVE_VEC256_MASK_LOAD_STORE */
+
+ if (n_left >= 4) /* at most one 4-element step is needed here */
+ {
+ clib_memcpy_u32_x4 (dst, src);
+ dst += 4;
+ src += 4;
+ n_left -= 4;
+ }
+#endif /* CLIB_HAVE_VEC128 */
+
+ while (n_left) /* scalar copy of whatever is left (the only path without CLIB_HAVE_VEC128) */
+ {
+ dst[0] = src[0];
+ dst += 1;
+ src += 1;
+ n_left -= 1;
+ }
+}
+
+#endif
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 76d345e16bc..5d01e48eb1c 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -99,6 +99,8 @@ clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
#undef clib_memcpy_fast_arch
+#include <vppinfra/memcpy.h>
+
/* c-11 string manipulation variants */
#ifndef EOK