diff options
author | Damjan Marion <damarion@cisco.com> | 2017-07-05 18:15:08 +0200 |
---|---|---|
committer | John Lo <loj@cisco.com> | 2017-07-06 13:25:46 +0000 |
commit | 31e59d9b01a04d734553136fdbd9e570b5241faf (patch) | |
tree | c477d463ac2b174d12f38809b948dd8055aea2d9 | |
parent | 8b81cb43359380e50d3fc216d93ff05894149939 (diff) |
vppinfra: revert clib_memcpy optimization
Looks like some compiler versions are producing wrong code when we are
copying 9-16 bytes so reverting back to the original code.
Change-Id: I74b5fa54a3b01f6288648f1cb0926030edd3b26f
Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r-- | src/vppinfra/memcpy_avx.h | 13 | ||||
-rw-r--r-- | src/vppinfra/memcpy_sse3.h | 11 |
2 files changed, 14 insertions, 10 deletions
diff --git a/src/vppinfra/memcpy_avx.h b/src/vppinfra/memcpy_avx.h index e3feb76b6b7..e987d044b58 100644 --- a/src/vppinfra/memcpy_avx.h +++ b/src/vppinfra/memcpy_avx.h @@ -51,19 +51,22 @@ #include <stdint.h> #include <x86intrin.h> -typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); -typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); - static inline void clib_mov16 (u8 * dst, const u8 * src) { - *(u8x16u *) dst = *(u8x16u *) src; + __m128i xmm0; + + xmm0 = _mm_loadu_si128 ((const __m128i *) src); + _mm_storeu_si128 ((__m128i *) dst, xmm0); } static inline void clib_mov32 (u8 * dst, const u8 * src) { - *(u8x32u *) dst = *(u8x32u *) src; + __m256i ymm0; + + ymm0 = _mm256_loadu_si256 ((const __m256i *) src); + _mm256_storeu_si256 ((__m256i *) dst, ymm0); } static inline void diff --git a/src/vppinfra/memcpy_sse3.h b/src/vppinfra/memcpy_sse3.h index 4fc48c86c8b..f61396c8922 100644 --- a/src/vppinfra/memcpy_sse3.h +++ b/src/vppinfra/memcpy_sse3.h @@ -51,19 +51,20 @@ #include <stdint.h> #include <x86intrin.h> -typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); -typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); - static inline void clib_mov16 (u8 * dst, const u8 * src) { - *(u8x16u *) dst = *(u8x16u *) src; + __m128i xmm0; + + xmm0 = _mm_loadu_si128 ((const __m128i *) src); + _mm_storeu_si128 ((__m128i *) dst, xmm0); } static inline void clib_mov32 (u8 * dst, const u8 * src) { - *(u8x32u *) dst = *(u8x32u *) src; + clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16); + clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16); } static inline void |