From e319de0b0407cd1e0ebc6ad523b9c608499d8c0c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 21 Oct 2016 19:30:42 +0200 Subject: vppinfra: clib_memcpy optimization This patch allows copiler to select which SIMD instructions will be used for copying 16 and 32 byte block. Immediate effect of this change will occur in avx2 variants of graph node functions. So far 128 byte registers were used even in code optimized for avx2 due to macro nature of clib_memcpy. With this patch gcc should pick 256 byte registers in such cases. Change-Id: I3510ee9b3bf01f3f0a9184e1a3f8e1bd827f8eee Signed-off-by: Damjan Marion --- vppinfra/vppinfra/memcpy_avx.h | 13 +++++-------- vppinfra/vppinfra/memcpy_sse3.h | 11 +++++------ vppinfra/vppinfra/string.h | 1 + 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vppinfra/vppinfra/memcpy_avx.h b/vppinfra/vppinfra/memcpy_avx.h index e987d044..e3feb76b 100644 --- a/vppinfra/vppinfra/memcpy_avx.h +++ b/vppinfra/vppinfra/memcpy_avx.h @@ -51,22 +51,19 @@ #include #include +typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); +typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); + static inline void clib_mov16 (u8 * dst, const u8 * src) { - __m128i xmm0; - - xmm0 = _mm_loadu_si128 ((const __m128i *) src); - _mm_storeu_si128 ((__m128i *) dst, xmm0); + *(u8x16u *) dst = *(u8x16u *) src; } static inline void clib_mov32 (u8 * dst, const u8 * src) { - __m256i ymm0; - - ymm0 = _mm256_loadu_si256 ((const __m256i *) src); - _mm256_storeu_si256 ((__m256i *) dst, ymm0); + *(u8x32u *) dst = *(u8x32u *) src; } static inline void diff --git a/vppinfra/vppinfra/memcpy_sse3.h b/vppinfra/vppinfra/memcpy_sse3.h index f61396c8..4fc48c86 100644 --- a/vppinfra/vppinfra/memcpy_sse3.h +++ b/vppinfra/vppinfra/memcpy_sse3.h @@ -51,20 +51,19 @@ #include #include +typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1))); +typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1))); + static inline void clib_mov16 (u8 * dst, const u8 * src) { - __m128i xmm0; - - xmm0 = _mm_loadu_si128 ((const __m128i *) src); - _mm_storeu_si128 ((__m128i *) dst, xmm0); + *(u8x16u *) dst = *(u8x16u *) src; } static inline void clib_mov32 (u8 * dst, const u8 * src) { - clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16); - clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16); + *(u8x32u *) dst = *(u8x32u *) src; } static inline void diff --git a/vppinfra/vppinfra/string.h b/vppinfra/vppinfra/string.h index 8d28375e..dda27b79 100644 --- a/vppinfra/vppinfra/string.h +++ b/vppinfra/vppinfra/string.h @@ -39,6 +39,7 @@ #define included_clib_string_h #include /* for CLIB_LINUX_KERNEL */ +#include #ifdef CLIB_LINUX_KERNEL #include -- cgit 1.2.3-korg