From 56f54af21d18f9fdd471b81db77a3942b0aa4d9c Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Tue, 12 Oct 2021 20:30:02 +0200
Subject: vppinfra: new memcpy for x86_64

Change-Id: I5a5055580479960ac53e3f989aa188faf57fb05d
Type: improvement
Signed-off-by: Damjan Marion
---
 src/vppinfra/memcpy_x86_64.h | 611 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 611 insertions(+)
 create mode 100644 src/vppinfra/memcpy_x86_64.h

(limited to 'src/vppinfra/memcpy_x86_64.h')

diff --git a/src/vppinfra/memcpy_x86_64.h b/src/vppinfra/memcpy_x86_64.h
new file mode 100644
index 00000000000..9662ab4e7ef
--- /dev/null
+++ b/src/vppinfra/memcpy_x86_64.h
@@ -0,0 +1,611 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Damjan Marion
+ */
+
+#ifndef included_clib_memcpy_x86_64_h
+#define included_clib_memcpy_x86_64_h
+#ifdef __x86_64__
+
+#include
+#include
+#include
+
+/* clang-format off */
+WARN_OFF (stringop-overflow)
+/* clang-format on */
+
+static_always_inline void
+clib_memcpy1 (void *d, void *s)
+{
+  *(u8 *) d = *(u8 *) s;
+}
+
+static_always_inline void
+clib_memcpy2 (void *d, void *s)
+{
+  *(u16u *) d = *(u16u *) s;
+}
+
+static_always_inline void
+clib_memcpy4 (void *d, void *s)
+{
+  *(u32u *) d = *(u32u *) s;
+}
+
+static_always_inline void
+clib_memcpy8 (void *d, void *s)
+{
+  *(u64u *) d = *(u64u *) s;
+}
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_memcpy16 (void *d, void *s)
+{
+  *(u8x16u *) d = *(u8x16u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+static_always_inline void
+clib_memcpy32 (void *d, void *s)
+{
+  *(u8x32u *) d = *(u8x32u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC512
+static_always_inline void
+clib_memcpy64 (void *d, void *s)
+{
+  *(u8x64u *) d = *(u8x64u *) s;
+}
+#endif
+
+static_always_inline void
+clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
+{
+  switch (n)
+    {
+    case 1:
+      clib_memcpy1 (dst, src);
+      break;
+    case 2:
+      clib_memcpy2 (dst, src);
+      break;
+    case 3:
+      clib_memcpy2 (dst, src);
+      clib_memcpy1 (dst + 2, src + 2);
+      break;
+    case 4:
+      clib_memcpy4 (dst, src);
+      break;
+    case 5:
+      clib_memcpy4 (dst, src);
+      clib_memcpy1 (dst + 4, src + 4);
+      break;
+    case 6:
+      clib_memcpy4 (dst, src);
+      clib_memcpy2 (dst + 4, src + 4);
+      break;
+    case 7:
+      clib_memcpy4 (dst, src);
+      clib_memcpy4 (dst + 3, src + 3);
+      break;
+    case 8:
+      clib_memcpy8 (dst, src);
+      break;
+    case 9:
+      clib_memcpy8 (dst, src);
+      clib_memcpy1 (dst + 8, src + 8);
+      break;
+    case 10:
+      clib_memcpy8 (dst, src);
+      clib_memcpy2 (dst + 8, src + 8);
+      break;
+    case 11:
+    case 12:
+      clib_memcpy8 (dst, src);
+      clib_memcpy4 (dst + n - 4, src + n - 4);
+      break;
+    case 13:
+    case 14:
+    case 15:
+      clib_memcpy8 (dst, src);
+      clib_memcpy8 (dst + n - 8, src + n - 8);
+      break;
+    case 16:
+      clib_memcpy16 (dst, src);
+      break;
+    case 17:
+      clib_memcpy16 (dst, src);
+      clib_memcpy1 (dst + 16, src + 16);
+      break;
+    case 18:
+      clib_memcpy16 (dst, src);
+      clib_memcpy2 (dst + 16, src + 16);
+      break;
+    case 20:
+      clib_memcpy16 (dst, src);
+      clib_memcpy4 (dst + 16, src + 16);
+      break;
+    case 24:
+      clib_memcpy16 (dst, src);
+      clib_memcpy8 (dst + 16, src + 16);
+      break;
+    default:
+      clib_memcpy16 (dst, src);
+      clib_memcpy16 (dst + n - 16, src + n - 16);
+      break;
+    }
+}
+
+static_always_inline void
+clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
+{
+  if (n < 32)
+    {
+      clib_memcpy_const_le32 (dst, src, n);
+      return;
+    }
+
+#if defined(CLIB_HAVE_VEC256)
+  switch (n)
+    {
+    case 32:
+      clib_memcpy32 (dst, src);
+      break;
+    case 33:
+      clib_memcpy32 (dst, src);
+      clib_memcpy1 (dst + 32, src + 32);
+      break;
+    case 34:
+      clib_memcpy32 (dst, src);
+      clib_memcpy2 (dst + 32, src + 32);
+      break;
+    case 36:
+      clib_memcpy32 (dst, src);
+      clib_memcpy4 (dst + 32, src + 32);
+      break;
+    case 40:
+      clib_memcpy32 (dst, src);
+      clib_memcpy8 (dst + 32, src + 32);
+      break;
+    case 48:
+      clib_memcpy32 (dst, src);
+      clib_memcpy16 (dst + 32, src + 32);
+      break;
+    default:
+      clib_memcpy32 (dst, src);
+      clib_memcpy32 (dst + n - 32, src + n - 32);
+      break;
+    }
+#else
+  while (n > 31)
+    {
+      clib_memcpy16 (dst, src);
+      clib_memcpy16 (dst + 16, src + 16);
+      dst += 32;
+      src += 32;
+      n -= 32;
+    }
+  clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
+{
+#if defined(CLIB_HAVE_VEC512)
+  while (n > 128)
+    {
+      clib_memcpy64 (dst, src);
+      dst += 64;
+      src += 64;
+      n -= 64;
+    }
+
+  if (n < 64)
+    {
+      clib_memcpy_const_le64 (dst, src, n);
+      return;
+    }
+
+  switch (n)
+    {
+    case 64:
+      clib_memcpy64 (dst, src);
+      break;
+    case 65:
+      clib_memcpy64 (dst, src);
+      clib_memcpy1 (dst + 64, src + 64);
+      break;
+    case 66:
+      clib_memcpy64 (dst, src);
+      clib_memcpy2 (dst + 64, src + 64);
+      break;
+    case 68:
+      clib_memcpy64 (dst, src);
+      clib_memcpy4 (dst + 64, src + 64);
+      break;
+    case 72:
+      clib_memcpy64 (dst, src);
+      clib_memcpy8 (dst + 64, src + 64);
+      break;
+    case 80:
+      clib_memcpy64 (dst, src);
+      clib_memcpy16 (dst + 64, src + 64);
+      break;
+    case 96:
+      clib_memcpy64 (dst, src);
+      clib_memcpy32 (dst + 64, src + 64);
+      break;
+    default:
+      clib_memcpy64 (dst, src);
+      clib_memcpy64 (dst + n - 64, src + n - 64);
+      break;
+    }
+#elif defined(CLIB_HAVE_VEC256)
+  while (n > 64)
+    {
+      clib_memcpy32 (dst, src);
+      dst += 32;
+      src += 32;
+      n -= 32;
+    }
+  clib_memcpy_const_le64 (dst, src, n);
+#else
+  while (n > 32)
+    {
+      clib_memcpy16 (dst, src);
+      dst += 16;
+      src += 16;
+      n -= 16;
+    }
+  clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void *
+clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
+{
+  u8 *d = (u8 *) dst, *s = (u8 *) src;
+
+  if (n == 0)
+    return dst;
+
+  if (COMPILE_TIME_CONST (n))
+    {
+      if (n)
+        clib_memcpy_x86_64_const (d, s, n);
+      return dst;
+    }
+
+  if (n <= 32)
+    {
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+      u32 mask = pow2_mask (n);
+      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
+#else
+      if (PREDICT_TRUE (n >= 16))
+        {
+          clib_memcpy16 (d, s);
+          clib_memcpy16 (d + n - 16, s + n - 16);
+        }
+      else if (PREDICT_TRUE (n >= 8))
+        {
+          clib_memcpy8 (d, s);
+          clib_memcpy8 (d + n - 8, s + n - 8);
+        }
+      else if (PREDICT_TRUE (n >= 4))
+        {
+          clib_memcpy4 (d, s);
+          clib_memcpy4 (d + n - 4, s + n - 4);
+        }
+      else if (PREDICT_TRUE (n > 1))
+        {
+          clib_memcpy2 (d, s);
+          clib_memcpy2 (d + n - 2, s + n - 2);
+        }
+      else
+        clib_memcpy1 (d, s);
+#endif
+    }
+#ifdef CLIB_HAVE_VEC512
+  else
+    {
+      u8x64 v0, v1, v2, v3;
+      u64 final_off, nr, off = 64;
+
+      if (n <= 64)
+        {
+          n -= 32;
+          u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+          u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
+          return dst;
+        }
+
+      u8x64_store_unaligned (u8x64_load_unaligned (s), d);
+
+      if (n <= 128)
+        goto done2;
+
+      if (n <= 192)
+        goto one;
+
+      if (n <= 512 + 64)
+        {
+          nr = round_pow2 (n - 128, 64);
+          goto last;
+        }
+
+      off -= ((u64) d) & 0x3f;
+      nr = round_pow2 (n - off - 64, 64);
+      final_off = (nr & ~(u64) 0x1ff) + off;
+
+    more:
+      v0 = u8x64_load_unaligned (s + off + 0x000);
+      v1 = u8x64_load_unaligned (s + off + 0x040);
+      v2 = u8x64_load_unaligned (s + off + 0x080);
+      v3 = u8x64_load_unaligned (s + off + 0x0c0);
+      u8x64_store_unaligned (v0, d + off + 0x000);
+      u8x64_store_unaligned (v1, d + off + 0x040);
+      u8x64_store_unaligned (v2, d + off + 0x080);
+      u8x64_store_unaligned (v3, d + off + 0x0c0);
+      v0 = u8x64_load_unaligned (s + off + 0x100);
+      v1 = u8x64_load_unaligned (s + off + 0x140);
+      v2 = u8x64_load_unaligned (s + off + 0x180);
+      v3 = u8x64_load_unaligned (s + off + 0x1c0);
+      u8x64_store_unaligned (v0, d + off + 0x100);
+      u8x64_store_unaligned (v1, d + off + 0x140);
+      u8x64_store_unaligned (v2, d + off + 0x180);
+      u8x64_store_unaligned (v3, d + off + 0x1c0);
+      off += 512;
+      if (off != final_off)
+        goto more;
+
+      if ((nr & 0x1ff) == 0)
+        goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 256))
+        {
+          v0 = u8x64_load_unaligned (s + off + 0x000);
+          v1 = u8x64_load_unaligned (s + off + 0x040);
+          v2 = u8x64_load_unaligned (s + off + 0x080);
+          v3 = u8x64_load_unaligned (s + off + 0x0c0);
+          u8x64_store_unaligned (v0, d + off + 0x000);
+          u8x64_store_unaligned (v1, d + off + 0x040);
+          u8x64_store_unaligned (v2, d + off + 0x080);
+          u8x64_store_unaligned (v3, d + off + 0x0c0);
+          off += 256;
+        }
+      if (PREDICT_TRUE (nr & 128))
+        {
+          v0 = u8x64_load_unaligned (s + off + 0x000);
+          v1 = u8x64_load_unaligned (s + off + 0x040);
+          u8x64_store_unaligned (v0, d + off + 0x000);
+          u8x64_store_unaligned (v1, d + off + 0x040);
+          off += 128;
+        }
+      if (PREDICT_TRUE (nr & 64))
+        {
+        one:
+          u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
+        }
+    done2:
+      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
+    }
+  return dst;
+#elif defined(CLIB_HAVE_VEC256)
+  else
+    {
+      u8x32 v0, v1, v2, v3;
+      u64 final_off, nr, off = 32;
+
+      u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+
+      if (n <= 64)
+        goto done2;
+
+      if (n <= 96)
+        goto one;
+
+      if (n <= 256 + 32)
+        {
+          nr = round_pow2 (n - 64, 32);
+          goto last;
+        }
+
+      off -= ((u64) d) & 0x1f;
+      nr = round_pow2 (n - off - 32, 32);
+      final_off = (nr & ~(u64) 0xff) + off;
+
+    more:
+      v0 = u8x32_load_unaligned (s + off + 0x00);
+      v1 = u8x32_load_unaligned (s + off + 0x20);
+      v2 = u8x32_load_unaligned (s + off + 0x40);
+      v3 = u8x32_load_unaligned (s + off + 0x60);
+      u8x32_store_unaligned (v0, d + off + 0x00);
+      u8x32_store_unaligned (v1, d + off + 0x20);
+      u8x32_store_unaligned (v2, d + off + 0x40);
+      u8x32_store_unaligned (v3, d + off + 0x60);
+      v0 = u8x32_load_unaligned (s + off + 0x80);
+      v1 = u8x32_load_unaligned (s + off + 0xa0);
+      v2 = u8x32_load_unaligned (s + off + 0xc0);
+      v3 = u8x32_load_unaligned (s + off + 0xe0);
+      u8x32_store_unaligned (v0, d + off + 0x80);
+      u8x32_store_unaligned (v1, d + off + 0xa0);
+      u8x32_store_unaligned (v2, d + off + 0xc0);
+      u8x32_store_unaligned (v3, d + off + 0xe0);
+      off += 256;
+      if (off != final_off)
+        goto more;
+
+      if ((nr & 0xff) == 0)
+        goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 128))
+        {
+          v0 = u8x32_load_unaligned (s + off + 0x00);
+          v1 = u8x32_load_unaligned (s + off + 0x20);
+          v2 = u8x32_load_unaligned (s + off + 0x40);
+          v3 = u8x32_load_unaligned (s + off + 0x60);
+          u8x32_store_unaligned (v0, d + off + 0x00);
+          u8x32_store_unaligned (v1, d + off + 0x20);
+          u8x32_store_unaligned (v2, d + off + 0x40);
+          u8x32_store_unaligned (v3, d + off + 0x60);
+          off += 128;
+        }
+      if (PREDICT_TRUE (nr & 64))
+        {
+          v0 = u8x32_load_unaligned (s + off + 0x00);
+          v1 = u8x32_load_unaligned (s + off + 0x20);
+          u8x32_store_unaligned (v0, d + off + 0x00);
+          u8x32_store_unaligned (v1, d + off + 0x20);
+          off += 64;
+        }
+      if (PREDICT_TRUE (nr & 32))
+        {
+        one:
+          u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
+        }
+    done2:
+      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
+    }
+  return dst;
+#elif defined(CLIB_HAVE_VEC128)
+  else
+    {
+      u8x16 v0, v1, v2, v3;
+      u64 final_off, nr, off = 32;
+
+      if (0 && n > 389)
+        {
+          __builtin_memcpy (d, s, n);
+          return dst;
+        }
+
+      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
+      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
+
+      if (n <= 48)
+        goto done2;
+
+      if (n <= 64)
+        goto one;
+
+      if (n <= 256 + 32)
+        {
+          nr = round_pow2 (n - 48, 16);
+          goto last;
+        }
+
+      off -= ((u64) d) & 0x0f;
+      nr = round_pow2 (n - off - 16, 16);
+      final_off = (nr & ~(u64) 0xff) + off;
+
+    more:
+      v0 = u8x16_load_unaligned (s + off + 0x00);
+      v1 = u8x16_load_unaligned (s + off + 0x10);
+      v2 = u8x16_load_unaligned (s + off + 0x20);
+      v3 = u8x16_load_unaligned (s + off + 0x30);
+      u8x16_store_unaligned (v0, d + off + 0x00);
+      u8x16_store_unaligned (v1, d + off + 0x10);
+      u8x16_store_unaligned (v2, d + off + 0x20);
+      u8x16_store_unaligned (v3, d + off + 0x30);
+      v0 = u8x16_load_unaligned (s + off + 0x40);
+      v1 = u8x16_load_unaligned (s + off + 0x50);
+      v2 = u8x16_load_unaligned (s + off + 0x60);
+      v3 = u8x16_load_unaligned (s + off + 0x70);
+      u8x16_store_unaligned (v0, d + off + 0x40);
+      u8x16_store_unaligned (v1, d + off + 0x50);
+      u8x16_store_unaligned (v2, d + off + 0x60);
+      u8x16_store_unaligned (v3, d + off + 0x70);
+      v0 = u8x16_load_unaligned (s + off + 0x80);
+      v1 = u8x16_load_unaligned (s + off + 0x90);
+      v2 = u8x16_load_unaligned (s + off + 0xa0);
+      v3 = u8x16_load_unaligned (s + off + 0xb0);
+      u8x16_store_unaligned (v0, d + off + 0x80);
+      u8x16_store_unaligned (v1, d + off + 0x90);
+      u8x16_store_unaligned (v2, d + off + 0xa0);
+      u8x16_store_unaligned (v3, d + off + 0xb0);
+      v0 = u8x16_load_unaligned (s + off + 0xc0);
+      v1 = u8x16_load_unaligned (s + off + 0xd0);
+      v2 = u8x16_load_unaligned (s + off + 0xe0);
+      v3 = u8x16_load_unaligned (s + off + 0xf0);
+      u8x16_store_unaligned (v0, d + off + 0xc0);
+      u8x16_store_unaligned (v1, d + off + 0xd0);
+      u8x16_store_unaligned (v2, d + off + 0xe0);
+      u8x16_store_unaligned (v3, d + off + 0xf0);
+      off += 256;
+      if (off != final_off)
+        goto more;
+
+      if ((nr & 0xff) == 0)
+        goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 128))
+        {
+          v0 = u8x16_load_unaligned (s + off + 0x00);
+          v1 = u8x16_load_unaligned (s + off + 0x10);
+          v2 = u8x16_load_unaligned (s + off + 0x20);
+          v3 = u8x16_load_unaligned (s + off + 0x30);
+          u8x16_store_unaligned (v0, d + off + 0x00);
+          u8x16_store_unaligned (v1, d + off + 0x10);
+          u8x16_store_unaligned (v2, d + off + 0x20);
+          u8x16_store_unaligned (v3, d + off + 0x30);
+          v0 = u8x16_load_unaligned (s + off + 0x40);
+          v1 = u8x16_load_unaligned (s + off + 0x50);
+          v2 = u8x16_load_unaligned (s + off + 0x60);
+          v3 = u8x16_load_unaligned (s + off + 0x70);
+          u8x16_store_unaligned (v0, d + off + 0x40);
+          u8x16_store_unaligned (v1, d + off + 0x50);
+          u8x16_store_unaligned (v2, d + off + 0x60);
+          u8x16_store_unaligned (v3, d + off + 0x70);
+          off += 128;
+        }
+      if (PREDICT_TRUE (nr & 64))
+        {
+          v0 = u8x16_load_unaligned (s + off + 0x00);
+          v1 = u8x16_load_unaligned (s + off + 0x10);
+          v2 = u8x16_load_unaligned (s + off + 0x20);
+          v3 = u8x16_load_unaligned (s + off + 0x30);
+          u8x16_store_unaligned (v0, d + off + 0x00);
+          u8x16_store_unaligned (v1, d + off + 0x10);
+          u8x16_store_unaligned (v2, d + off + 0x20);
+          u8x16_store_unaligned (v3, d + off + 0x30);
+          off += 64;
+        }
+      if (PREDICT_TRUE (nr & 32))
+        {
+          v0 = u8x16_load_unaligned (s + off + 0x00);
+          v1 = u8x16_load_unaligned (s + off + 0x10);
+          u8x16_store_unaligned (v0, d + off + 0x00);
+          u8x16_store_unaligned (v1, d + off + 0x10);
+          off += 32;
+        }
+      if (PREDICT_TRUE (nr & 16))
+        {
+        one:
+          u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
+        }
+    done2:
+      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
+    }
+  return dst;
+#else
+#error "SSE/AVX2/AVX512 must be enabled"
+#endif
+
+  return dst;
+}
+
+/* clang-format off */
+WARN_ON (stringop-overflow)
+/* clang-format on */
+
+#endif
+#endif
--
cgit 1.2.3-korg
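
Usage sketch (illustrative, not part of the patch above): a minimal caller showing
the two paths the new routine distinguishes — a compile-time-constant size, which
goes through clib_memcpy_x86_64_const, and a runtime size, which goes through the
SSE/AVX2/AVX-512 block selected at build time. The function name copy_example, the
buffer sizes, and the assumption that including <vppinfra/memcpy_x86_64.h> in a VPP
build also makes the u8/size_t types available are assumptions; only
clib_memcpy_x86_64 itself comes from the header added by this commit.

    #include <vppinfra/memcpy_x86_64.h>

    static void
    copy_example (void)
    {
      u8 src[256], dst[256];
      size_t runtime_len = 200; /* pretend this arrives at run time */

      for (int i = 0; i < 256; i++)
        src[i] = (u8) i;

      /* constant size: COMPILE_TIME_CONST (n) is true, so the fixed-size
         inlined path is used */
      clib_memcpy_x86_64 (dst, src, 64);

      /* runtime size: dispatches on the vector width enabled at build time */
      clib_memcpy_x86_64 (dst, src, runtime_len);
    }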