aboutsummaryrefslogtreecommitdiffstats
path: root/src/vppinfra/memcpy_x86_64.h
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2021-10-12 20:30:02 +0200
committerFlorin Coras <florin.coras@gmail.com>2021-11-10 16:45:23 +0000
commit56f54af21d18f9fdd471b81db77a3942b0aa4d9c (patch)
tree80c5e9681dc209cdbb3c54d7205bc07ad4379f69 /src/vppinfra/memcpy_x86_64.h
parent904638f4625c82d166d67870f9cf8088dd29a8b2 (diff)
vppinfra: new memcpy for x86_64
Change-Id: I5a5055580479960ac53e3f989aa188faf57fb05d Type: improvement Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra/memcpy_x86_64.h')
-rw-r--r--src/vppinfra/memcpy_x86_64.h611
1 files changed, 611 insertions, 0 deletions
diff --git a/src/vppinfra/memcpy_x86_64.h b/src/vppinfra/memcpy_x86_64.h
new file mode 100644
index 00000000000..9662ab4e7ef
--- /dev/null
+++ b/src/vppinfra/memcpy_x86_64.h
@@ -0,0 +1,611 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Damjan Marion
+ */
+
+#ifndef included_clib_memcpy_x86_64_h
+#define included_clib_memcpy_x86_64_h
+#ifdef __x86_64__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/warnings.h>
+#include <stdio.h>
+
+/* clang-format off */
+WARN_OFF (stringop-overflow)
+/* clang-format on */
+
+static_always_inline void
+clib_memcpy1 (void *d, void *s)
+{
+ *(u8 *) d = *(u8 *) s;
+}
+
+static_always_inline void
+clib_memcpy2 (void *d, void *s)
+{
+ *(u16u *) d = *(u16u *) s;
+}
+
+static_always_inline void
+clib_memcpy4 (void *d, void *s)
+{
+ *(u32u *) d = *(u32u *) s;
+}
+
+static_always_inline void
+clib_memcpy8 (void *d, void *s)
+{
+ *(u64u *) d = *(u64u *) s;
+}
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_memcpy16 (void *d, void *s)
+{
+ *(u8x16u *) d = *(u8x16u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+static_always_inline void
+clib_memcpy32 (void *d, void *s)
+{
+ *(u8x32u *) d = *(u8x32u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC512
+static_always_inline void
+clib_memcpy64 (void *d, void *s)
+{
+ *(u8x64u *) d = *(u8x64u *) s;
+}
+#endif
+
+static_always_inline void
+clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
+{
+ switch (n)
+ {
+ case 1:
+ clib_memcpy1 (dst, src);
+ break;
+ case 2:
+ clib_memcpy2 (dst, src);
+ break;
+ case 3:
+ clib_memcpy2 (dst, src);
+ clib_memcpy1 (dst + 2, src + 2);
+ break;
+ case 4:
+ clib_memcpy4 (dst, src);
+ break;
+ case 5:
+ clib_memcpy4 (dst, src);
+ clib_memcpy1 (dst + 4, src + 4);
+ break;
+ case 6:
+ clib_memcpy4 (dst, src);
+ clib_memcpy2 (dst + 4, src + 4);
+ break;
+ case 7:
+ clib_memcpy4 (dst, src);
+ clib_memcpy4 (dst + 3, src + 3);
+ break;
+ case 8:
+ clib_memcpy8 (dst, src);
+ break;
+ case 9:
+ clib_memcpy8 (dst, src);
+ clib_memcpy1 (dst + 8, src + 8);
+ break;
+ case 10:
+ clib_memcpy8 (dst, src);
+ clib_memcpy2 (dst + 8, src + 8);
+ break;
+ case 11:
+ case 12:
+ clib_memcpy8 (dst, src);
+ clib_memcpy4 (dst + n - 4, src + n - 4);
+ break;
+ case 13:
+ case 14:
+ case 15:
+ clib_memcpy8 (dst, src);
+ clib_memcpy8 (dst + n - 8, src + n - 8);
+ break;
+ case 16:
+ clib_memcpy16 (dst, src);
+ break;
+ case 17:
+ clib_memcpy16 (dst, src);
+ clib_memcpy1 (dst + 16, src + 16);
+ break;
+ case 18:
+ clib_memcpy16 (dst, src);
+ clib_memcpy2 (dst + 16, src + 16);
+ break;
+ case 20:
+ clib_memcpy16 (dst, src);
+ clib_memcpy4 (dst + 16, src + 16);
+ break;
+ case 24:
+ clib_memcpy16 (dst, src);
+ clib_memcpy8 (dst + 16, src + 16);
+ break;
+ default:
+ clib_memcpy16 (dst, src);
+ clib_memcpy16 (dst + n - 16, src + n - 16);
+ break;
+ }
+}
+
+static_always_inline void
+clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
+{
+ if (n < 32)
+ {
+ clib_memcpy_const_le32 (dst, src, n);
+ return;
+ }
+
+#if defined(CLIB_HAVE_VEC256)
+ switch (n)
+ {
+ case 32:
+ clib_memcpy32 (dst, src);
+ break;
+ case 33:
+ clib_memcpy32 (dst, src);
+ clib_memcpy1 (dst + 32, src + 32);
+ break;
+ case 34:
+ clib_memcpy32 (dst, src);
+ clib_memcpy2 (dst + 32, src + 32);
+ break;
+ case 36:
+ clib_memcpy32 (dst, src);
+ clib_memcpy4 (dst + 32, src + 32);
+ break;
+ case 40:
+ clib_memcpy32 (dst, src);
+ clib_memcpy8 (dst + 32, src + 32);
+ break;
+ case 48:
+ clib_memcpy32 (dst, src);
+ clib_memcpy16 (dst + 32, src + 32);
+ break;
+ default:
+ clib_memcpy32 (dst, src);
+ clib_memcpy32 (dst + n - 32, src + n - 32);
+ break;
+ }
+#else
+ while (n > 31)
+ {
+ clib_memcpy16 (dst, src);
+ clib_memcpy16 (dst + 16, src + 16);
+ dst += 32;
+ src += 32;
+ n -= 32;
+ }
+ clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
+{
+#if defined(CLIB_HAVE_VEC512)
+ while (n > 128)
+ {
+ clib_memcpy64 (dst, src);
+ dst += 64;
+ src += 64;
+ n -= 64;
+ }
+
+ if (n < 64)
+ {
+ clib_memcpy_const_le64 (dst, src, n);
+ return;
+ }
+
+ switch (n)
+ {
+ case 64:
+ clib_memcpy64 (dst, src);
+ break;
+ case 65:
+ clib_memcpy64 (dst, src);
+ clib_memcpy1 (dst + 64, src + 64);
+ break;
+ case 66:
+ clib_memcpy64 (dst, src);
+ clib_memcpy2 (dst + 64, src + 64);
+ break;
+ case 68:
+ clib_memcpy64 (dst, src);
+ clib_memcpy4 (dst + 64, src + 64);
+ break;
+ case 72:
+ clib_memcpy64 (dst, src);
+ clib_memcpy8 (dst + 64, src + 64);
+ break;
+ case 80:
+ clib_memcpy64 (dst, src);
+ clib_memcpy16 (dst + 64, src + 64);
+ break;
+ case 96:
+ clib_memcpy64 (dst, src);
+ clib_memcpy32 (dst + 64, src + 64);
+ break;
+ default:
+ clib_memcpy64 (dst, src);
+ clib_memcpy64 (dst + n - 64, src + n - 64);
+ break;
+ }
+#elif defined(CLIB_HAVE_VEC256)
+ while (n > 64)
+ {
+ clib_memcpy32 (dst, src);
+ dst += 32;
+ src += 32;
+ n -= 32;
+ }
+ clib_memcpy_const_le64 (dst, src, n);
+#else
+ while (n > 32)
+ {
+ clib_memcpy16 (dst, src);
+ dst += 16;
+ src += 16;
+ n -= 16;
+ }
+ clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void *
+clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
+{
+ u8 *d = (u8 *) dst, *s = (u8 *) src;
+
+ if (n == 0)
+ return dst;
+
+ if (COMPILE_TIME_CONST (n))
+ {
+ if (n)
+ clib_memcpy_x86_64_const (d, s, n);
+ return dst;
+ }
+
+ if (n <= 32)
+ {
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u32 mask = pow2_mask (n);
+ u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
+#else
+ if (PREDICT_TRUE (n >= 16))
+ {
+ clib_memcpy16 (d, s);
+ clib_memcpy16 (d + n - 16, s + n - 16);
+ }
+ else if (PREDICT_TRUE (n >= 8))
+ {
+ clib_memcpy8 (d, s);
+ clib_memcpy8 (d + n - 8, s + n - 8);
+ }
+ else if (PREDICT_TRUE (n >= 4))
+ {
+ clib_memcpy4 (d, s);
+ clib_memcpy4 (d + n - 4, s + n - 4);
+ }
+ else if (PREDICT_TRUE (n > 1))
+ {
+ clib_memcpy2 (d, s);
+ clib_memcpy2 (d + n - 2, s + n - 2);
+ }
+ else
+ clib_memcpy1 (d, s);
+#endif
+ }
+#ifdef CLIB_HAVE_VEC512
+ else
+ {
+ u8x64 v0, v1, v2, v3;
+ u64 final_off, nr, off = 64;
+
+ if (n <= 64)
+ {
+ n -= 32;
+ u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+ u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
+ return dst;
+ }
+
+ u8x64_store_unaligned (u8x64_load_unaligned (s), d);
+
+ if (n <= 128)
+ goto done2;
+
+ if (n <= 192)
+ goto one;
+
+ if (n <= 512 + 64)
+ {
+ nr = round_pow2 (n - 128, 64);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x3f;
+ nr = round_pow2 (n - off - 64, 64);
+ final_off = (nr & ~(u64) 0x1ff) + off;
+
+ more:
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ v2 = u8x64_load_unaligned (s + off + 0x080);
+ v3 = u8x64_load_unaligned (s + off + 0x0c0);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ u8x64_store_unaligned (v2, d + off + 0x080);
+ u8x64_store_unaligned (v3, d + off + 0x0c0);
+ v0 = u8x64_load_unaligned (s + off + 0x100);
+ v1 = u8x64_load_unaligned (s + off + 0x140);
+ v2 = u8x64_load_unaligned (s + off + 0x180);
+ v3 = u8x64_load_unaligned (s + off + 0x1c0);
+ u8x64_store_unaligned (v0, d + off + 0x100);
+ u8x64_store_unaligned (v1, d + off + 0x140);
+ u8x64_store_unaligned (v2, d + off + 0x180);
+ u8x64_store_unaligned (v3, d + off + 0x1c0);
+ off += 512;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0x1ff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 256))
+ {
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ v2 = u8x64_load_unaligned (s + off + 0x080);
+ v3 = u8x64_load_unaligned (s + off + 0x0c0);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ u8x64_store_unaligned (v2, d + off + 0x080);
+ u8x64_store_unaligned (v3, d + off + 0x0c0);
+ off += 256;
+ }
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ one:
+ u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
+ }
+ return dst;
+#elif defined(CLIB_HAVE_VEC256)
+ else
+ {
+ u8x32 v0, v1, v2, v3;
+ u64 final_off, nr, off = 32;
+
+ u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+
+ if (n <= 64)
+ goto done2;
+
+ if (n <= 96)
+ goto one;
+
+ if (n <= 256 + 32)
+ {
+ nr = round_pow2 (n - 64, 32);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x1f;
+ nr = round_pow2 (n - off - 32, 32);
+ final_off = (nr & ~(u64) 0xff) + off;
+
+ more:
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ v2 = u8x32_load_unaligned (s + off + 0x40);
+ v3 = u8x32_load_unaligned (s + off + 0x60);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ u8x32_store_unaligned (v2, d + off + 0x40);
+ u8x32_store_unaligned (v3, d + off + 0x60);
+ v0 = u8x32_load_unaligned (s + off + 0x80);
+ v1 = u8x32_load_unaligned (s + off + 0xa0);
+ v2 = u8x32_load_unaligned (s + off + 0xc0);
+ v3 = u8x32_load_unaligned (s + off + 0xe0);
+ u8x32_store_unaligned (v0, d + off + 0x80);
+ u8x32_store_unaligned (v1, d + off + 0xa0);
+ u8x32_store_unaligned (v2, d + off + 0xc0);
+ u8x32_store_unaligned (v3, d + off + 0xe0);
+ off += 256;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0xff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ v2 = u8x32_load_unaligned (s + off + 0x40);
+ v3 = u8x32_load_unaligned (s + off + 0x60);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ u8x32_store_unaligned (v2, d + off + 0x40);
+ u8x32_store_unaligned (v3, d + off + 0x60);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ off += 64;
+ }
+ if (PREDICT_TRUE (nr & 32))
+ {
+ one:
+ u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
+ }
+ return dst;
+#elif defined(CLIB_HAVE_VEC128)
+ else
+ {
+ u8x16 v0, v1, v2, v3;
+ u64 final_off, nr, off = 32;
+
+ if (0 && n > 389)
+ {
+ __builtin_memcpy (d, s, n);
+ return dst;
+ }
+
+ u8x16_store_unaligned (u8x16_load_unaligned (s), d);
+ u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
+
+ if (n <= 48)
+ goto done2;
+
+ if (n <= 64)
+ goto one;
+
+ if (n <= 256 + 32)
+ {
+ nr = round_pow2 (n - 48, 16);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x0f;
+ nr = round_pow2 (n - off - 16, 16);
+ final_off = (nr & ~(u64) 0xff) + off;
+
+ more:
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ v0 = u8x16_load_unaligned (s + off + 0x40);
+ v1 = u8x16_load_unaligned (s + off + 0x50);
+ v2 = u8x16_load_unaligned (s + off + 0x60);
+ v3 = u8x16_load_unaligned (s + off + 0x70);
+ u8x16_store_unaligned (v0, d + off + 0x40);
+ u8x16_store_unaligned (v1, d + off + 0x50);
+ u8x16_store_unaligned (v2, d + off + 0x60);
+ u8x16_store_unaligned (v3, d + off + 0x70);
+ v0 = u8x16_load_unaligned (s + off + 0x80);
+ v1 = u8x16_load_unaligned (s + off + 0x90);
+ v2 = u8x16_load_unaligned (s + off + 0xa0);
+ v3 = u8x16_load_unaligned (s + off + 0xb0);
+ u8x16_store_unaligned (v0, d + off + 0x80);
+ u8x16_store_unaligned (v1, d + off + 0x90);
+ u8x16_store_unaligned (v2, d + off + 0xa0);
+ u8x16_store_unaligned (v3, d + off + 0xb0);
+ v0 = u8x16_load_unaligned (s + off + 0xc0);
+ v1 = u8x16_load_unaligned (s + off + 0xd0);
+ v2 = u8x16_load_unaligned (s + off + 0xe0);
+ v3 = u8x16_load_unaligned (s + off + 0xf0);
+ u8x16_store_unaligned (v0, d + off + 0xc0);
+ u8x16_store_unaligned (v1, d + off + 0xd0);
+ u8x16_store_unaligned (v2, d + off + 0xe0);
+ u8x16_store_unaligned (v3, d + off + 0xf0);
+ off += 256;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0xff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ v0 = u8x16_load_unaligned (s + off + 0x40);
+ v1 = u8x16_load_unaligned (s + off + 0x50);
+ v2 = u8x16_load_unaligned (s + off + 0x60);
+ v3 = u8x16_load_unaligned (s + off + 0x70);
+ u8x16_store_unaligned (v0, d + off + 0x40);
+ u8x16_store_unaligned (v1, d + off + 0x50);
+ u8x16_store_unaligned (v2, d + off + 0x60);
+ u8x16_store_unaligned (v3, d + off + 0x70);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ off += 64;
+ }
+ if (PREDICT_TRUE (nr & 32))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ off += 32;
+ }
+ if (PREDICT_TRUE (nr & 16))
+ {
+ one:
+ u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
+ }
+ return dst;
+#else
+#error "SSE/AVX2/AVX512 must be enabled"
+#endif
+
+ return dst;
+}
+
+/* clang-format off */
+WARN_ON (stringop-overflow)
+/* clang-format on */
+
+#endif
+#endif