author | Christian Ehrhardt <christian.ehrhardt@canonical.com> | 2016-07-06 09:22:35 +0200
---|---|---
committer | Christian Ehrhardt <christian.ehrhardt@canonical.com> | 2016-07-06 16:09:40 +0200
commit | 8b25d1ad5d2264bdfc2818c7bda74ee2697df6db (patch) |
tree | 8c3c769777f7e66a2d1ba7dd7651b563cfde370b | /lib/librte_eal/common/include/arch
parent | 97f17497d162afdb82c8704bf097f0fee3724b2e (diff) |
Imported Upstream version 16.07-rc1
Change-Id: I40a523e52f12e8496fdd69e902824b0226c303de
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Diffstat (limited to 'lib/librte_eal/common/include/arch')
5 files changed, 34 insertions, 109 deletions
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
index 988125b3..da6c233a 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
@@ -323,12 +323,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	return memcpy(dst, src, n);
 }
 
-static inline void *
-rte_memcpy_func(void *dst, const void *src, size_t n)
-{
-	return memcpy(dst, src, n);
-}
-
 #endif /* RTE_ARCH_ARM_NEON_MEMCPY */
 
 #ifdef __cplusplus
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index 917cdc1b..5db66b63 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -80,12 +80,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
-static inline void *
-rte_memcpy_func(void *dst, const void *src, size_t n)
-{
-	return memcpy(dst, src, n);
-}
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/tile/rte_memcpy.h b/lib/librte_eal/common/include/arch/tile/rte_memcpy.h
index 9b5b37ef..e606957c 100644
--- a/lib/librte_eal/common/include/arch/tile/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/tile/rte_memcpy.h
@@ -80,12 +80,6 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
-static inline void *
-rte_memcpy_func(void *dst, const void *src, size_t n)
-{
-	return memcpy(dst, src, n);
-}
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index f463ab30..413035e7 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -363,71 +363,26 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 }
 
 /**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
-/**
- * Copy 64-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m256i ymm0, ymm1;
-
-	while (n >= 64) {
-		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 64;
-		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
-		src = (const uint8_t *)src + 64;
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
-		dst = (uint8_t *)dst + 64;
-	}
-}
-
-/**
- * Copy 256-byte blocks from one location to another,
+ * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  */
 static inline void
-rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
-	__m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+	__m256i ymm0, ymm1, ymm2, ymm3;
 
-	while (n >= 256) {
+	while (n >= 128) {
 		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 256;
+		n -= 128;
 		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
-		ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
-		ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
-		ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
-		ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
-		src = (const uint8_t *)src + 256;
+		src = (const uint8_t *)src + 128;
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
-		dst = (uint8_t *)dst + 256;
+		dst = (uint8_t *)dst + 128;
 	}
 }
 
@@ -466,51 +421,56 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	}
 
 	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
+	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
-	if (n <= 512) {
-		if (n >= 256) {
-			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 256;
-			dst = (uint8_t *)dst + 256;
-		}
+	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
+COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
-COPY_BLOCK_64_BACK31:
 		if (n > 32) {
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
 	}
 
 	/**
-	 * Make store aligned when copy size exceeds 512 bytes
+	 * Make store aligned when copy size exceeds 256 bytes
 	 */
 	dstofss = (uintptr_t)dst & 0x1F;
 	if (dstofss > 0) {
@@ -522,35 +482,19 @@ COPY_BLOCK_64_BACK31:
 	}
 
 	/**
-	 * Copy 256-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
+	 * Copy 128-byte blocks
 	 */
-	rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	bits = n;
-	n = n & 255;
+	n = n & 127;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 
 	/**
-	 * Copy 64-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
-	 */
-	if (n >= 64) {
-		rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
-		bits = n;
-		n = n & 63;
-		bits -= n;
-		src = (const uint8_t *)src + bits;
-		dst = (uint8_t *)dst + bits;
-	}
-
-	/**
 	 * Copy whatever left
 	 */
-	goto COPY_BLOCK_64_BACK31;
+	goto COPY_BLOCK_128_BACK31;
 }
 
 #else /* RTE_MACHINE_CPUFLAG */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_rtm.h b/lib/librte_eal/common/include/arch/x86/rte_rtm.h
index d9356419..0649f794 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_rtm.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_rtm.h
@@ -50,11 +50,10 @@ void rte_xend(void)
 	asm volatile(".byte 0x0f,0x01,0xd5" ::: "memory");
 }
 
-static __attribute__((__always_inline__)) inline
-void rte_xabort(const unsigned int status)
-{
-	asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory");
-}
+/* not an inline function to workaround a clang bug with -O0 */
+#define rte_xabort(status) do { \
+	asm volatile(".byte 0xc6,0xf8,%P0" :: "i" (status) : "memory"); \
+} while (0)
 
 static __attribute__((__always_inline__)) inline
 int rte_xtest(void)
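
Note on the last hunk: rte_xabort() becomes a macro because its status byte is passed to the asm "i" (immediate) constraint and therefore must be a compile-time constant; the new comment records that this works around a clang bug at -O0, where the always-inline wrapper was not folded away and the constraint could not be satisfied. The following is a minimal usage sketch, not part of this commit: it assumes the rte_xbegin()/rte_xend() helpers and the RTE_XBEGIN_STARTED constant from the same rte_rtm.h header, and the helper function name is hypothetical.

#include <rte_rtm.h>

/* Transactionally increment a counter, aborting with an explicit
 * status code when the value looks invalid. */
static int tsx_increment(int *counter)
{
	if (rte_xbegin() == RTE_XBEGIN_STARTED) {
		if (*counter < 0)
			rte_xabort(0x01); /* must remain a literal constant */
		(*counter)++;
		rte_xend();
		return 0;
	}
	/* Transaction aborted or unsupported: caller should take a
	 * fallback (e.g. locked) path instead. */
	return -1;
}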