author | Damjan Marion <damarion@cisco.com> | 2021-04-21 21:11:35 +0200
committer | Benoît Ganne <bganne@cisco.com> | 2021-04-26 13:47:40 +0000
commit | 856d062ce67f93d83f1ff302a394e8a4448ad017 (patch)
tree | 7b7dc8436850bcf03ab74d91de65eb8fee59e6d2 /src
parent | c99ab12cbb9252806db55f7a79e5d84b41df3525 (diff)
vppinfra: clib_memcpy_u32() utilizing SIMD mask loads/stores
Type: improvement
Change-Id: I55c4688bd1feffae139ce12a66d15885373e5cd7
Signed-off-by: Damjan Marion <damarion@cisco.com>
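The patch replaces the per-width copy loops in vlib_buffer_copy_indices() with a generic clib_memcpy_u32() that copies the bulk of the data with full-width vector loads/stores and finishes the odd tail with a single masked load/store instead of a scalar loop. As a rough sketch of what the u32x16_mask_load_zero()/u32x16_mask_store() wrappers used below boil down to on AVX-512 targets (the helper name copy_tail_u32 is illustrative, not part of the patch):

#include <immintrin.h>
#include <stdint.h>

/* Illustrative sketch only: copy n < 16 u32 values with one masked
 * AVX-512 load/store pair, roughly what u32x16_mask_load_zero () and
 * u32x16_mask_store () expand to. Requires -mavx512f. */
static inline void
copy_tail_u32 (uint32_t *dst, const uint32_t *src, unsigned n)
{
  __mmask16 mask = (__mmask16) ((1u << n) - 1);     /* pow2_mask (n) */
  __m512i v = _mm512_maskz_loadu_epi32 (mask, src); /* masked-off lanes load as 0 */
  _mm512_mask_storeu_epi32 (dst, mask, v);          /* only masked-in lanes are written */
}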
Diffstat (limited to 'src')
-rw-r--r-- | src/vlib/buffer_funcs.h | 38
-rw-r--r-- | src/vppinfra/CMakeLists.txt | 1
-rw-r--r-- | src/vppinfra/memcpy.h | 155
-rw-r--r-- | src/vppinfra/string.h | 2
4 files changed, 159 insertions, 37 deletions
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 8ab9c4e53ad..7829986d643 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -121,43 +121,7 @@ vlib_buffer_get_default_data_size (vlib_main_t * vm)
 static_always_inline void
 vlib_buffer_copy_indices (u32 * dst, u32 * src, u32 n_indices)
 {
-#if defined(CLIB_HAVE_VEC512)
-  while (n_indices >= 16)
-    {
-      u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
-      dst += 16;
-      src += 16;
-      n_indices -= 16;
-    }
-#endif
-
-#if defined(CLIB_HAVE_VEC256)
-  while (n_indices >= 8)
-    {
-      u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
-      dst += 8;
-      src += 8;
-      n_indices -= 8;
-    }
-#endif
-
-#if defined(CLIB_HAVE_VEC128)
-  while (n_indices >= 4)
-    {
-      u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
-      dst += 4;
-      src += 4;
-      n_indices -= 4;
-    }
-#endif
-
-  while (n_indices)
-    {
-      dst[0] = src[0];
-      dst += 1;
-      src += 1;
-      n_indices -= 1;
-    }
+  clib_memcpy_u32 (dst, src, n_indices);
 }
 
 always_inline void
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 8cebd32ffe3..11193a37482 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -144,6 +144,7 @@ set(VPPINFRA_HEADERS
   macros.h
   maplog.h
   math.h
+  memcpy.h
   memcpy_avx2.h
   memcpy_avx512.h
   memcpy_sse3.h
diff --git a/src/vppinfra/memcpy.h b/src/vppinfra/memcpy.h
new file mode 100644
index 00000000000..115379797d2
--- /dev/null
+++ b/src/vppinfra/memcpy.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/clib.h>
+#ifndef included_memcpy_h
+#define included_memcpy_h
+
+static_always_inline void
+clib_memcpy_u32_x4 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC128)
+  u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
+#else
+  clib_memcpy_fast (dst, src, 4 * sizeof (u32));
+#endif
+}
+
+static_always_inline void
+clib_memcpy_u32_x8 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC256)
+  u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
+#else
+  clib_memcpy_u32_x4 (dst, src);
+  clib_memcpy_u32_x4 (dst + 4, src + 4);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_u32_x16 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC512)
+  u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
+#else
+  clib_memcpy_u32_x8 (dst, src);
+  clib_memcpy_u32_x8 (dst + 8, src + 8);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_u32 (u32 *dst, u32 *src, u32 n_left)
+{
+#if defined(CLIB_HAVE_VEC128)
+  if (COMPILE_TIME_CONST (n_left))
+    {
+      /* for n_left defined as a compile-time constant we should prevent
+       * the compiler from using more expensive mask load/store for common
+       * cases where a smaller register load/store exists */
+      switch (n_left)
+	{
+	case 4:
+	  clib_memcpy_u32_x4 (dst, src);
+	  return;
+	case 8:
+	  clib_memcpy_u32_x8 (dst, src);
+	  return;
+	case 12:
+	  clib_memcpy_u32_x8 (dst, src);
+	  clib_memcpy_u32_x4 (dst + 8, src + 8);
+	  return;
+	case 16:
+	  clib_memcpy_u32_x16 (dst, src);
+	  return;
+	case 32:
+	  clib_memcpy_u32_x16 (dst, src);
+	  clib_memcpy_u32_x16 (dst + 16, src + 16);
+	  return;
+	case 64:
+	  clib_memcpy_u32_x16 (dst, src);
+	  clib_memcpy_u32_x16 (dst + 16, src + 16);
+	  clib_memcpy_u32_x16 (dst + 32, src + 32);
+	  clib_memcpy_u32_x16 (dst + 48, src + 48);
+	  return;
+	default:
+	  break;
+	}
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  while (n_left >= 64)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      clib_memcpy_u32_x16 (dst + 16, src + 16);
+      clib_memcpy_u32_x16 (dst + 32, src + 32);
+      clib_memcpy_u32_x16 (dst + 48, src + 48);
+      dst += 64;
+      src += 64;
+      n_left -= 64;
+    }
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+  while (n_left >= 32)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      clib_memcpy_u32_x16 (dst + 16, src + 16);
+      dst += 32;
+      src += 32;
+      n_left -= 32;
+    }
+#endif
+
+  while (n_left >= 16)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      dst += 16;
+      src += 16;
+      n_left -= 16;
+    }
+
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+  if (n_left)
+    {
+      u16 mask = pow2_mask (n_left);
+      u32x16_mask_store (u32x16_mask_load_zero (src, mask), dst, mask);
+    }
+  return;
+#endif
+
+  if (n_left >= 8)
+    {
+      clib_memcpy_u32_x8 (dst, src);
+      dst += 8;
+      src += 8;
+      n_left -= 8;
+    }
+
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+  if (n_left)
+    {
+      u8 mask = pow2_mask (n_left);
+      u32x8_mask_store (u32x8_mask_load_zero (src, mask), dst, mask);
+    }
+  return;
+#endif
+
+  if (n_left >= 4)
+    {
+      clib_memcpy_u32_x4 (dst, src);
+      dst += 4;
+      src += 4;
+      n_left -= 4;
+    }
+#endif
+
+  while (n_left)
+    {
+      dst[0] = src[0];
+      dst += 1;
+      src += 1;
+      n_left -= 1;
+    }
+}
+
+#endif
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 76d345e16bc..5d01e48eb1c 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -99,6 +99,8 @@ clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
 
 #undef clib_memcpy_fast_arch
 
+#include <vppinfra/memcpy.h>
+
 /* c-11 string manipulation variants */
 
 #ifndef EOK
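Since string.h now includes the new header, any translation unit that already pulls in <vppinfra/string.h> gets clib_memcpy_u32() for free. A hedged usage sketch (example_copy and its arguments are made up, not part of the patch) showing both dispatch paths:

#include <vppinfra/string.h>

static void
example_copy (u32 *dst, u32 *src, u32 n)
{
  /* compile-time-constant count: the switch above resolves this to one
   * x8 copy plus one x4 copy, bypassing the masked path entirely */
  clib_memcpy_u32 (dst, src, 12);

  /* runtime count: full-width vector loops, then a masked (or scalar) tail */
  clib_memcpy_u32 (dst, src, n);
}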