diff options
author | Damjan Marion <damarion@cisco.com> | 2016-10-21 19:30:42 +0200 |
---|---|---|
committer | Ole Trøan <otroan@employees.org> | 2016-10-24 10:00:58 +0000 |
commit | e319de0b0407cd1e0ebc6ad523b9c608499d8c0c (patch) | |
tree | cb08798cc0dfad1d29419f2d3270ea41d00567f2 | |
parent | d3b85b0157b7c3f69b4ed58ad9dde67281150b54 (diff) |
vppinfra: clib_memcpy optimization
This patch allows copiler to select which SIMD instructions
will be used for copying 16 and 32 byte block.
Immediate effect of this change will occur in avx2 variants of
graph node functions. So far 128 byte registers were used
even in code optimized for avx2 due to macro nature of
clib_memcpy. With this patch gcc should pick 256 byte registers
in such cases.
Change-Id: I3510ee9b3bf01f3f0a9184e1a3f8e1bd827f8eee
Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r-- | vppinfra/vppinfra/memcpy_avx.h | 13 | ||||
-rw-r--r-- | vppinfra/vppinfra/memcpy_sse3.h | 11 | ||||
-rw-r--r-- | vppinfra/vppinfra/string.h | 1 |
3 files changed, 11 insertions, 14 deletions
diff --git a/vppinfra/vppinfra/memcpy_avx.h b/vppinfra/vppinfra/memcpy_avx.h index e987d044b58..e3feb76b6b7 100644 --- a/vppinfra/vppinfra/memcpy_avx.h +++ b/vppinfra/vppinfra/memcpy_avx.h @@ -51,22 +51,19 @@ #include <stdint.h> #include <x86intrin.h> +typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); +typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); + static inline void clib_mov16 (u8 * dst, const u8 * src) { - __m128i xmm0; - - xmm0 = _mm_loadu_si128 ((const __m128i *) src); - _mm_storeu_si128 ((__m128i *) dst, xmm0); + *(u8x16u *) dst = *(u8x16u *) src; } static inline void clib_mov32 (u8 * dst, const u8 * src) { - __m256i ymm0; - - ymm0 = _mm256_loadu_si256 ((const __m256i *) src); - _mm256_storeu_si256 ((__m256i *) dst, ymm0); + *(u8x32u *) dst = *(u8x32u *) src; } static inline void diff --git a/vppinfra/vppinfra/memcpy_sse3.h b/vppinfra/vppinfra/memcpy_sse3.h index f61396c8922..4fc48c86c8b 100644 --- a/vppinfra/vppinfra/memcpy_sse3.h +++ b/vppinfra/vppinfra/memcpy_sse3.h @@ -51,20 +51,19 @@ #include <stdint.h> #include <x86intrin.h> +typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); +typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); + static inline void clib_mov16 (u8 * dst, const u8 * src) { - __m128i xmm0; - - xmm0 = _mm_loadu_si128 ((const __m128i *) src); - _mm_storeu_si128 ((__m128i *) dst, xmm0); + *(u8x16u *) dst = *(u8x16u *) src; } static inline void clib_mov32 (u8 * dst, const u8 * src) { - clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16); - clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16); + *(u8x32u *) dst = *(u8x32u *) src; } static inline void diff --git a/vppinfra/vppinfra/string.h b/vppinfra/vppinfra/string.h index 8d28375ec0a..dda27b7961b 100644 --- a/vppinfra/vppinfra/string.h +++ b/vppinfra/vppinfra/string.h @@ -39,6 +39,7 @@ #define included_clib_string_h #include <vppinfra/clib.h> /* for CLIB_LINUX_KERNEL */ +#include <vppinfra/vector.h> #ifdef CLIB_LINUX_KERNEL #include <linux/string.h> |