From 14864770be4068b34ffc1124f47f5e9fe2a929d2 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Tue, 22 May 2018 14:07:47 +0200
Subject: vppinfra: add clib_count_equal_uXX and clib_memset_uXX functions

Change-Id: I56782652d8ef10304900cc293cfc0502689d800e
Signed-off-by: Damjan Marion
---
 src/vppinfra/string.h        | 334 +++++++++++++++++++++++++++++++++++++++++++
 src/vppinfra/vector_avx2.h   |   4 +-
 src/vppinfra/vector_avx512.h |   2 +-
 src/vppinfra/vector_sse42.h  |   4 +-
 4 files changed, 339 insertions(+), 5 deletions(-)

diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 7873e4647d9..c70996c007b 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -146,6 +146,340 @@ clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
 #endif
 }
 
+static_always_inline void
+clib_memset_u64 (void *p, u64 val, uword count)
+{
+  u64 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+  u64x8 v512 = u64x8_splat (val);
+  while (count >= 8)
+    {
+      u64x8_store_unaligned (v512, ptr);
+      ptr += 8;
+      count -= 8;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  u64x4 v256 = u64x4_splat (val);
+  while (count >= 4)
+    {
+      u64x4_store_unaligned (v256, ptr);
+      ptr += 4;
+      count -= 4;
+    }
+  if (count == 0)
+    return;
+#else
+  while (count >= 4)
+    {
+      ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+      ptr += 4;
+      count -= 4;
+    }
+#endif
+  while (count--)
+    ptr++[0] = val;
+}
+
+static_always_inline void
+clib_memset_u32 (void *p, u32 val, uword count)
+{
+  u32 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+  u32x16 v512 = u32x16_splat (val);
+  while (count >= 16)
+    {
+      u32x16_store_unaligned (v512, ptr);
+      ptr += 16;
+      count -= 16;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  u32x8 v256 = u32x8_splat (val);
+  while (count >= 8)
+    {
+      u32x8_store_unaligned (v256, ptr);
+      ptr += 8;
+      count -= 8;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  u32x4 v128 = u32x4_splat (val);
+  while (count >= 4)
+    {
+      u32x4_store_unaligned (v128, ptr);
+      ptr += 4;
+      count -= 4;
+    }
+#else
+  while (count >= 4)
+    {
+      ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+      ptr += 4;
+      count -= 4;
+    }
+#endif
+  while (count--)
+    ptr++[0] = val;
+}
+
+static_always_inline void
+clib_memset_u16 (void *p, u16 val, uword count)
+{
+  u16 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+  u16x32 v512 = u16x32_splat (val);
+  while (count >= 32)
+    {
+      u16x32_store_unaligned (v512, ptr);
+      ptr += 32;
+      count -= 32;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  u16x16 v256 = u16x16_splat (val);
+  while (count >= 16)
+    {
+      u16x16_store_unaligned (v256, ptr);
+      ptr += 16;
+      count -= 16;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  u16x8 v128 = u16x8_splat (val);
+  while (count >= 8)
+    {
+      u16x8_store_unaligned (v128, ptr);
+      ptr += 8;
+      count -= 8;
+    }
+#else
+  while (count >= 4)
+    {
+      ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+      ptr += 4;
+      count -= 4;
+    }
+#endif
+  while (count--)
+    ptr++[0] = val;
+}
+
+static_always_inline void
+clib_memset_u8 (void *p, u8 val, uword count)
+{
+  u8 *ptr = p;
+#if defined(CLIB_HAVE_VEC512)
+  u8x64 v512 = u8x64_splat (val);
+  while (count >= 64)
+    {
+      u8x64_store_unaligned (v512, ptr);
+      ptr += 64;
+      count -= 64;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  u8x32 v256 = u8x32_splat (val);
+  while (count >= 32)
+    {
+      u8x32_store_unaligned (v256, ptr);
+      ptr += 32;
+      count -= 32;
+    }
+  if (count == 0)
+    return;
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  u8x16 v128 = u8x16_splat (val);
+  while (count >= 16)
+    {
+      u8x16_store_unaligned (v128, ptr);
+      ptr += 16;
+      count -= 16;
+    }
+#else
+  while (count >= 4)
+    {
+      ptr[0] = ptr[1] = ptr[2] = ptr[3] = val;
+      ptr += 4;
+      count -= 4;
+    }
+#endif
+  while (count--)
+    ptr++[0] = val;
+}
+
+static_always_inline uword
+clib_count_equal_u64 (u64 * data, uword max_count)
+{
+  uword count = 0;
+  u64 first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+  while (u64x8_is_all_equal (u64x8_load_unaligned (data), first))
+    {
+      data += 8;
+      count += 8;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  while (u64x4_is_all_equal (u64x4_load_unaligned (data), first))
+    {
+      data += 4;
+      count += 4;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  while (u64x2_is_all_equal (u64x2_load_unaligned (data), first))
+    {
+      data += 2;
+      count += 2;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+  while (count < max_count && (data[0] == first))
+    {
+      data += 1;
+      count += 1;
+    }
+  return count;
+}
+
+static_always_inline uword
+clib_count_equal_u32 (u32 * data, uword max_count)
+{
+  uword count = 0;
+  u32 first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+  while (u32x16_is_all_equal (u32x16_load_unaligned (data), first))
+    {
+      data += 16;
+      count += 16;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  while (u32x8_is_all_equal (u32x8_load_unaligned (data), first))
+    {
+      data += 8;
+      count += 8;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  while (u32x4_is_all_equal (u32x4_load_unaligned (data), first))
+    {
+      data += 4;
+      count += 4;
+      if (count >= max_count)
+        return max_count;
+    }
+#endif
+  while (count < max_count && (data[0] == first))
+    {
+      data += 1;
+      count += 1;
+    }
+  return count;
+}
+
+static_always_inline uword
+clib_count_equal_u16 (u16 * data, uword max_count)
+{
+  uword count = 0;
+  u16 first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+  while (count + 32 <= max_count &&
+         u16x32_is_all_equal (u16x32_load_unaligned (data), first))
+    {
+      data += 32;
+      count += 32;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  while (count + 16 <= max_count &&
+         u16x16_is_all_equal (u16x16_load_unaligned (data), first))
+    {
+      data += 16;
+      count += 16;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  while (count + 8 <= max_count &&
+         u16x8_is_all_equal (u16x8_load_unaligned (data), first))
+    {
+      data += 8;
+      count += 8;
+    }
+#endif
+  while (count < max_count && (data[0] == first))
+    {
+      data += 1;
+      count += 1;
+    }
+  return count;
+}
+
+static_always_inline uword
+clib_count_equal_u8 (u8 * data, uword max_count)
+{
+  uword count = 0;
+  u8 first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+  while (count + 64 <= max_count &&
+         u8x64_is_all_equal (u8x64_load_unaligned (data), first))
+    {
+      data += 64;
+      count += 64;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC256)
+  while (count + 32 <= max_count &&
+         u8x32_is_all_equal (u8x32_load_unaligned (data), first))
+    {
+      data += 32;
+      count += 32;
+    }
+#endif
+#if defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
+  while (count + 16 <= max_count &&
+         u8x16_is_all_equal (u8x16_load_unaligned (data), first))
+    {
+      data += 16;
+      count += 16;
+    }
+#endif
+  while (count < max_count && (data[0] == first))
+    {
+      data += 1;
+      count += 1;
+    }
+  return count;
+}
+
+
 #endif /* included_clib_string_h */
 
 /*
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index 1fb41dfd7df..e2d5701c0e8 100644
--- a/src/vppinfra/vector_avx2.h
+++ b/src/vppinfra/vector_avx2.h
@@ -47,8 +47,8 @@ t##s##x##c##_is_all_zero (t##s##x##c x)			\
 { return _mm256_testz_si256 ((__m256i) x, (__m256i) x); }	\
 \
 static_always_inline int					\
-t##s##x##c##_is_equal (t##s##x##c x, t##s##x##c y)		\
-{ return _mm256_testc_si256 ((__m256i) x, (__m256i) y); }	\
+t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)		\
+{ return t##s##x##c##_is_all_zero (a ^ b); }			\
 \
 static_always_inline int					\
 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)		\
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index c2903e2aa1a..4aee8d1170a 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -48,7 +48,7 @@ t##s##x##c##_is_all_zero (t##s##x##c v)			\
 \
 static_always_inline int					\
 t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)		\
-{ return t##s##x##c##_is_all_zero (a ^b); }			\
+{ return t##s##x##c##_is_all_zero (a ^ b); }			\
 \
 static_always_inline int					\
 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)		\
diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h
index 053826d92bc..fa377cc983b 100644
--- a/src/vppinfra/vector_sse42.h
+++ b/src/vppinfra/vector_sse42.h
@@ -69,8 +69,8 @@ t##s##x##c##_is_all_zero (t##s##x##c x)			\
 { return _mm_testz_si128 ((__m128i) x, (__m128i) x); }		\
 \
 static_always_inline int					\
-t##s##x##c##_is_equal (t##s##x##c x, t##s##x##c y)		\
-{ return _mm_testc_si128 ((__m128i) x, (__m128i) y); }		\
+t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)		\
+{ return t##s##x##c##_is_all_zero (a ^ b); }			\
 \
 static_always_inline int					\
 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)		\
-- 
cgit 1.2.3-korg
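
A minimal usage sketch of the new helpers follows. It is not part of the patch above: the function fill_and_measure and its parameters are purely illustrative, and it assumes a VPP build where the vppinfra headers are on the include path and where the CLIB_HAVE_VEC* macros are set up by the usual vppinfra configuration.

/* Illustrative only -- not part of the commit above.  Fills a buffer of
 * u16 values with one pattern, then counts how many leading elements
 * match the first one.  For n > 0 and a freshly filled buffer this
 * should return n. */
#include <vppinfra/clib.h>
#include <vppinfra/string.h>

static uword
fill_and_measure (u16 * buf, u16 pattern, uword n)
{
  /* Store the pattern using the widest vector stores the target supports. */
  clib_memset_u16 (buf, pattern, n);
  /* Count equal leading elements, capped at n. */
  return clib_count_equal_u16 (buf, n);
}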