author     Damjan Marion <damarion@cisco.com>    2017-11-22 12:41:32 +0100
committer  Dave Barach <openvpp@barachs.net>     2017-11-22 13:20:57 +0000
commit     b2e1fe9c4d6263ced0e37c78a7ba1837f5ff1c86 (patch)
tree       1cbb84948117ebec2dbd5e6e8d4aa52f39762bdb
parent     3ce7bcb55b8bdf53aa4975a6312a5fd4a1ca2d0c (diff)
use intel intrinsics in clib_memcpy64_x4
While my original attempt was to write this function to be portable
and work on non-x86 systems, it seems that gcc-5 doesn't respect the
alignment attribute and issues aligned vector instructions, which
causes a crash.
Change-Id: If165c8d482ac96f2b71959d326f9772b48097b48
Signed-off-by: Damjan Marion <damarion@cisco.com>
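
For background, here is a minimal sketch (not part of this commit) contrasting
the two approaches. It assumes an AVX2-capable build (-mavx2) and GCC vector
extensions; the type and function names are hypothetical:

    #include <x86intrin.h>

    /* Old approach: a GCC vector type marked aligned(1).  gcc-5 was observed
       to ignore the attribute and emit an aligned move (vmovdqa), which
       faults when the address is not 32-byte aligned. */
    typedef unsigned char u8x32_unaligned
      __attribute__ ((vector_size (32), aligned (1)));

    static void
    copy32_vector_ext (void *dst, void *src)
    {
      *(u8x32_unaligned *) dst = *(u8x32_unaligned *) src;
    }

    /* New approach: explicit unaligned intrinsics, which always emit
       unaligned moves (vmovdqu) and are defined for any address. */
    static void
    copy32_intrinsics (void *dst, void *src)
    {
      _mm256_storeu_si256 ((__m256i *) dst,
                           _mm256_loadu_si256 ((__m256i const *) src));
    }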
-rw-r--r--   src/vppinfra/string.h   101
1 file changed, 54 insertions(+), 47 deletions(-)
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 914f6a7bbc4..4d2ff7875a2 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -53,6 +53,10 @@
 #include <vppinfra/standalone_string.h>
 #endif
 
+#if __x86_64__
+#include <x86intrin.h>
+#endif
+
 /* Exchanges source and destination. */
 void clib_memswap (void *_a, void *_b, uword bytes);
 
@@ -82,53 +86,56 @@ void clib_memswap (void *_a, void *_b, uword bytes);
 static_always_inline void
 clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
 {
-#if defined (CLIB_HAVE_VEC512)
-  u8x64 __attribute__ ((aligned (1))) r0 = *(((u8x64 *) s) + 0);
-
-  *(((u8x64 *) d0) + 0) = r0;
-  *(((u8x64 *) d1) + 0) = r0;
-  *(((u8x64 *) d2) + 0) = r0;
-  *(((u8x64 *) d3) + 0) = r0;
-#elif defined (CLIB_HAVE_VEC256)
-  u8x32 __attribute__ ((aligned (1))) r0 = *(((u8x32 *) s) + 0);
-  u8x32 __attribute__ ((aligned (1))) r1 = *(((u8x32 *) s) + 1);
-
-  *(((u8x32 *) d0) + 0) = r0;
-  *(((u8x32 *) d0) + 1) = r1;
-
-  *(((u8x32 *) d1) + 0) = r0;
-  *(((u8x32 *) d1) + 1) = r1;
-
-  *(((u8x32 *) d2) + 0) = r0;
-  *(((u8x32 *) d2) + 1) = r1;
-
-  *(((u8x32 *) d3) + 0) = r0;
-  *(((u8x32 *) d3) + 1) = r1;
-#elif defined (CLIB_HAVE_VEC128)
-  u8x16 __attribute__ ((aligned (1))) r0 = *(((u8x16 *) s) + 0);
-  u8x16 __attribute__ ((aligned (1))) r1 = *(((u8x16 *) s) + 1);
-  u8x16 __attribute__ ((aligned (1))) r2 = *(((u8x16 *) s) + 3);
-  u8x16 __attribute__ ((aligned (1))) r3 = *(((u8x16 *) s) + 4);
-
-  *(((u8x16 *) d0) + 0) = r0;
-  *(((u8x16 *) d0) + 1) = r1;
-  *(((u8x16 *) d0) + 2) = r2;
-  *(((u8x16 *) d0) + 3) = r3;
-
-  *(((u8x16 *) d1) + 0) = r0;
-  *(((u8x16 *) d1) + 1) = r1;
-  *(((u8x16 *) d1) + 2) = r2;
-  *(((u8x16 *) d1) + 3) = r3;
-
-  *(((u8x16 *) d2) + 0) = r0;
-  *(((u8x16 *) d2) + 1) = r1;
-  *(((u8x16 *) d2) + 2) = r2;
-  *(((u8x16 *) d2) + 3) = r3;
-
-  *(((u8x16 *) d3) + 0) = r0;
-  *(((u8x16 *) d3) + 1) = r1;
-  *(((u8x16 *) d3) + 2) = r2;
-  *(((u8x16 *) d3) + 3) = r3;
+#if defined (__AVX512F__)
+  __m512i r0 = _mm512_loadu_si512 (s);
+
+  _mm512_storeu_si512 (d0, r0);
+  _mm512_storeu_si512 (d1, r0);
+  _mm512_storeu_si512 (d2, r0);
+  _mm512_storeu_si512 (d3, r0);
+
+#elif defined (__AVX2__)
+  __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
+  __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
+
+  _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
+  _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
+
+  _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
+  _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
+
+  _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
+  _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
+
+  _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
+  _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
+
+#elif defined (__SSSE3__)
+  __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
+  __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
+  __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
+  __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
+
+  _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
+  _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
+  _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
+  _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
+
+  _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
+  _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
+  _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
+  _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
+
+  _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
+  _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
+  _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
+  _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
+
+  _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
+  _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
+  _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
+  _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
+
 #else
   clib_memcpy (d0, s, 64);
   clib_memcpy (d1, s, 64);
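
As a usage sketch (a hypothetical caller, not from this commit), the helper
replicates one 64-byte source into four destinations in a single call, e.g. a
template being stamped into four buffers:

    #include <vppinfra/string.h>

    /* Copy the same 64-byte template into four destination buffers. */
    static void
    replicate_template (void *dst[4], void *src64)
    {
      clib_memcpy64_x4 (dst[0], dst[1], dst[2], dst[3], src64);
    }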