aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2016-10-21 19:30:42 +0200
committerOle Trøan <otroan@employees.org>2016-10-24 10:00:58 +0000
commite319de0b0407cd1e0ebc6ad523b9c608499d8c0c (patch)
treecb08798cc0dfad1d29419f2d3270ea41d00567f2
parentd3b85b0157b7c3f69b4ed58ad9dde67281150b54 (diff)
vppinfra: clib_memcpy optimization
This patch allows copiler to select which SIMD instructions will be used for copying 16 and 32 byte block. Immediate effect of this change will occur in avx2 variants of graph node functions. So far 128 byte registers were used even in code optimized for avx2 due to macro nature of clib_memcpy. With this patch gcc should pick 256 byte registers in such cases. Change-Id: I3510ee9b3bf01f3f0a9184e1a3f8e1bd827f8eee Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r--vppinfra/vppinfra/memcpy_avx.h13
-rw-r--r--vppinfra/vppinfra/memcpy_sse3.h11
-rw-r--r--vppinfra/vppinfra/string.h1
3 files changed, 11 insertions, 14 deletions
diff --git a/vppinfra/vppinfra/memcpy_avx.h b/vppinfra/vppinfra/memcpy_avx.h
index e987d044b58..e3feb76b6b7 100644
--- a/vppinfra/vppinfra/memcpy_avx.h
+++ b/vppinfra/vppinfra/memcpy_avx.h
@@ -51,22 +51,19 @@
#include <stdint.h>
#include <x86intrin.h>
+typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1)));
+typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1)));
+
static inline void
clib_mov16 (u8 * dst, const u8 * src)
{
- __m128i xmm0;
-
- xmm0 = _mm_loadu_si128 ((const __m128i *) src);
- _mm_storeu_si128 ((__m128i *) dst, xmm0);
+ *(u8x16u *) dst = *(u8x16u *) src;
}
static inline void
clib_mov32 (u8 * dst, const u8 * src)
{
- __m256i ymm0;
-
- ymm0 = _mm256_loadu_si256 ((const __m256i *) src);
- _mm256_storeu_si256 ((__m256i *) dst, ymm0);
+ *(u8x32u *) dst = *(u8x32u *) src;
}
static inline void
diff --git a/vppinfra/vppinfra/memcpy_sse3.h b/vppinfra/vppinfra/memcpy_sse3.h
index f61396c8922..4fc48c86c8b 100644
--- a/vppinfra/vppinfra/memcpy_sse3.h
+++ b/vppinfra/vppinfra/memcpy_sse3.h
@@ -51,20 +51,19 @@
#include <stdint.h>
#include <x86intrin.h>
+typedef u8 u8x16u __attribute__ ((vector_size (16), aligned (1)));
+typedef u8 u8x32u __attribute__ ((vector_size (32), aligned (1)));
+
static inline void
clib_mov16 (u8 * dst, const u8 * src)
{
- __m128i xmm0;
-
- xmm0 = _mm_loadu_si128 ((const __m128i *) src);
- _mm_storeu_si128 ((__m128i *) dst, xmm0);
+ *(u8x16u *) dst = *(u8x16u *) src;
}
static inline void
clib_mov32 (u8 * dst, const u8 * src)
{
- clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16);
- clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16);
+ *(u8x32u *) dst = *(u8x32u *) src;
}
static inline void
diff --git a/vppinfra/vppinfra/string.h b/vppinfra/vppinfra/string.h
index 8d28375ec0a..dda27b7961b 100644
--- a/vppinfra/vppinfra/string.h
+++ b/vppinfra/vppinfra/string.h
@@ -39,6 +39,7 @@
#define included_clib_string_h
#include <vppinfra/clib.h> /* for CLIB_LINUX_KERNEL */
+#include <vppinfra/vector.h>
#ifdef CLIB_LINUX_KERNEL
#include <linux/string.h>