From 856d062ce67f93d83f1ff302a394e8a4448ad017 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Wed, 21 Apr 2021 21:11:35 +0200
Subject: vppinfra: clib_memcpy_u32() utilizing SIMD mask loads/stores

Type: improvement
Change-Id: I55c4688bd1feffae139ce12a66d15885373e5cd7
Signed-off-by: Damjan Marion
---
 src/vppinfra/memcpy.h | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 src/vppinfra/memcpy.h

diff --git a/src/vppinfra/memcpy.h b/src/vppinfra/memcpy.h
new file mode 100644
index 00000000000..115379797d2
--- /dev/null
+++ b/src/vppinfra/memcpy.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include
+#ifndef included_memcpy_h
+#define included_memcpy_h
+
+static_always_inline void
+clib_memcpy_u32_x4 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC128)
+  u32x4_store_unaligned (u32x4_load_unaligned (src), dst);
+#else
+  clib_memcpy_fast (dst, src, 4 * sizeof (u32));
+#endif
+}
+static_always_inline void
+clib_memcpy_u32_x8 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC256)
+  u32x8_store_unaligned (u32x8_load_unaligned (src), dst);
+#else
+  clib_memcpy_u32_x4 (dst, src);
+  clib_memcpy_u32_x4 (dst + 4, src + 4);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_u32_x16 (u32 *dst, u32 *src)
+{
+#if defined(CLIB_HAVE_VEC512)
+  u32x16_store_unaligned (u32x16_load_unaligned (src), dst);
+#else
+  clib_memcpy_u32_x8 (dst, src);
+  clib_memcpy_u32_x8 (dst + 8, src + 8);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_u32 (u32 *dst, u32 *src, u32 n_left)
+{
+#if defined(CLIB_HAVE_VEC128)
+  if (COMPILE_TIME_CONST (n_left))
+    {
+      /* for n_left defined as a compile-time constant we should prevent the
+       * compiler from using more expensive mask load/store for common cases
+       * where a smaller register load/store exists */
+      switch (n_left)
+        {
+        case 4:
+          clib_memcpy_u32_x4 (dst, src);
+          return;
+        case 8:
+          clib_memcpy_u32_x8 (dst, src);
+          return;
+        case 12:
+          clib_memcpy_u32_x8 (dst, src);
+          clib_memcpy_u32_x4 (dst + 8, src + 8);
+          return;
+        case 16:
+          clib_memcpy_u32_x16 (dst, src);
+          return;
+        case 32:
+          clib_memcpy_u32_x16 (dst, src);
+          clib_memcpy_u32_x16 (dst + 16, src + 16);
+          return;
+        case 64:
+          clib_memcpy_u32_x16 (dst, src);
+          clib_memcpy_u32_x16 (dst + 16, src + 16);
+          clib_memcpy_u32_x16 (dst + 32, src + 32);
+          clib_memcpy_u32_x16 (dst + 48, src + 48);
+          return;
+        default:
+          break;
+        }
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  while (n_left >= 64)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      clib_memcpy_u32_x16 (dst + 16, src + 16);
+      clib_memcpy_u32_x16 (dst + 32, src + 32);
+      clib_memcpy_u32_x16 (dst + 48, src + 48);
+      dst += 64;
+      src += 64;
+      n_left -= 64;
+    }
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+  while (n_left >= 32)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      clib_memcpy_u32_x16 (dst + 16, src + 16);
+      dst += 32;
+      src += 32;
+      n_left -= 32;
+    }
+#endif
+
+  while (n_left >= 16)
+    {
+      clib_memcpy_u32_x16 (dst, src);
+      dst += 16;
+      src += 16;
+      n_left -= 16;
+    }
+
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+  if (n_left)
+    {
+      u16 mask = pow2_mask (n_left);
+      u32x16_mask_store (u32x16_mask_load_zero (src, mask), dst, mask);
+    }
+  return;
+#endif
+
+  if (n_left >= 8)
+    {
+      clib_memcpy_u32_x8 (dst, src);
+      dst += 8;
+      src += 8;
+      n_left -= 8;
+    }
+
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+  if (n_left)
+    {
+      u8 mask = pow2_mask (n_left);
+      u32x8_mask_store (u32x8_mask_load_zero (src, mask), dst, mask);
+    }
+  return;
+#endif
+
+  if (n_left >= 4)
+    {
+      clib_memcpy_u32_x4 (dst, src);
+      dst += 4;
+      src += 4;
+      n_left -= 4;
+    }
+#endif
+
+  while (n_left)
+    {
+      dst[0] = src[0];
+      dst += 1;
+      src += 1;
+      n_left -= 1;
+    }
+}
+
+#endif
--
cgit 1.2.3-korg
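
The core of the tail handling above is a masked SIMD load/store: instead of finishing the copy with a scalar loop, the remaining n_left < 16 elements are moved in a single masked operation whose mask has the low n_left bits set (pow2_mask (n_left)). The sketch below shows the same idea written against raw AVX-512F intrinsics rather than the vppinfra wrappers u32x16_mask_load_zero / u32x16_mask_store; the function name copy_u32_avx512 and the small test driver are illustrative only and are not part of this patch. Build with something like gcc -O2 -mavx512f.

    /* Standalone illustration of the masked-tail technique used in
     * clib_memcpy_u32(), using raw AVX-512F intrinsics.  Names here are
     * illustrative, not VPP APIs. */
    #include <stdint.h>
    #include <stdio.h>
    #include <immintrin.h>

    static void
    copy_u32_avx512 (uint32_t *dst, const uint32_t *src, uint32_t n_left)
    {
      /* full 16-wide copies, like the n_left >= 16 loop in the patch */
      while (n_left >= 16)
        {
          _mm512_storeu_si512 (dst, _mm512_loadu_si512 (src));
          dst += 16;
          src += 16;
          n_left -= 16;
        }

      /* tail: one masked load + masked store instead of a scalar loop;
       * the mask has the low n_left bits set, like pow2_mask (n_left) */
      if (n_left)
        {
          __mmask16 mask = (__mmask16) ((1u << n_left) - 1);
          __m512i v = _mm512_maskz_loadu_epi32 (mask, src);
          _mm512_mask_storeu_epi32 (dst, mask, v);
        }
    }

    int
    main (void)
    {
      uint32_t src[23], dst[23] = { 0 };
      for (uint32_t i = 0; i < 23; i++)
        src[i] = i;
      copy_u32_avx512 (dst, src, 23);
      printf ("dst[22] = %u\n", dst[22]); /* prints 22 */
      return 0;
    }

On targets without AVX-512 mask load/store support, clib_memcpy_u32() instead uses the AVX2 masked variants when CLIB_HAVE_VEC256_MASK_LOAD_STORE is defined, and otherwise falls back to progressively smaller fixed-size copies followed by a scalar loop, as the preprocessor branches in the patch show.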