From e58041f242bf4bd120ecc9619b88348d80b94c17 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 18 Jan 2019 19:56:09 +0100 Subject: deprecate clib_memcpy64_x4 Storing buffer in local template seems to be better option.... Change-Id: I1a2fdd68cb956f99a5b36d2cd810fc623e089bcf Signed-off-by: Damjan Marion --- src/vppinfra/string.h | 68 --------------------------------------------------- 1 file changed, 68 deletions(-) (limited to 'src/vppinfra') diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h index 42f7890f3d0..d9cd8fe1af9 100644 --- a/src/vppinfra/string.h +++ b/src/vppinfra/string.h @@ -213,74 +213,6 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n) */ #define clib_memset(s,c,n) memset_s_inline(s,n,c,n) -/* - * Copy 64 bytes of data to 4 destinations - * this function is typically used in quad-loop case when whole cacheline - * needs to be copied to 4 different places. First it reads whole cacheline - * to 1/2/4 SIMD registers and then it writes data to 4 destinations. - */ - -static_always_inline void -clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s) -{ -#if defined (__AVX512F__) - __m512i r0 = _mm512_loadu_si512 (s); - - _mm512_storeu_si512 (d0, r0); - _mm512_storeu_si512 (d1, r0); - _mm512_storeu_si512 (d2, r0); - _mm512_storeu_si512 (d3, r0); - -#elif defined (__AVX2__) - __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32)); - __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32)); - - _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0); - _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1); - - _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0); - _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1); - - _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0); - _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1); - - _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0); - _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1); - -#elif defined (__SSSE3__) - __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16)); - __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16)); - __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16)); - __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16)); - - _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0); - _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1); - _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2); - _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3); - - _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0); - _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1); - _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2); - _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3); - - _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0); - _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1); - _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2); - _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3); - - _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0); - _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1); - _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2); - _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3); - -#else - clib_memcpy_fast (d0, s, 64); - clib_memcpy_fast (d1, s, 64); - clib_memcpy_fast (d2, s, 64); - clib_memcpy_fast (d3, s, 64); -#endif -} - static_always_inline void clib_memset_u64 (void *p, u64 val, uword count) { -- cgit 1.2.3-korg