From 7d14aad6379ebf96b75dd076260a2fccb7caa3b4 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 May 2021 19:31:41 +0200 Subject: vppinfra: fix x86 packs / packus wrappers They both take signed value as input. Type: fix Change-Id: If3d8ec4e0b1c02d7d65262bdd9db49ff7fbfef39 Signed-off-by: Damjan Marion --- src/vppinfra/vector_avx2.h | 22 +++++++------ src/vppinfra/vector_avx512.h | 13 ++++++++ src/vppinfra/vector_funcs.h | 35 ++++++++++---------- src/vppinfra/vector_sse42.h | 76 ++++++-------------------------------------- 4 files changed, 52 insertions(+), 94 deletions(-) diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h index 584bd207b27..f38a3bdae73 100644 --- a/src/vppinfra/vector_avx2.h +++ b/src/vppinfra/vector_avx2.h @@ -105,17 +105,19 @@ _(u64x2, u64x4) #undef _ /* *INDENT-ON* */ -always_inline u8x32 -u16x16_pack (u16x16 lo, u16x16 hi) -{ - return (u8x32) _mm256_packus_epi16 ((__m256i) lo, (__m256i) hi); -} +/* 256 bit packs. */ +#define _(f, t, fn) \ + always_inline t t##_pack (f lo, f hi) \ + { \ + return (t) fn ((__m256i) lo, (__m256i) hi); \ + } + +_ (i16x16, i8x32, _mm256_packs_epi16) +_ (i16x16, u8x32, _mm256_packus_epi16) +_ (i32x8, i16x16, _mm256_packs_epi32) +_ (i32x8, u16x16, _mm256_packus_epi32) -always_inline i8x32 -i16x16_pack (i16x16 lo, i16x16 hi) -{ - return (i8x32) _mm256_packs_epi16 ((__m256i) lo, (__m256i) hi); -} +#undef _ static_always_inline u32 u8x32_msb_mask (u8x32 v) diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h index 2f5763e3c92..3a01c1ed824 100644 --- a/src/vppinfra/vector_avx512.h +++ b/src/vppinfra/vector_avx512.h @@ -85,6 +85,19 @@ u16x32_msb_mask (u16x32 v) return (u32) _mm512_movepi16_mask ((__m512i) v); } +/* 512-bit packs */ +#define _(f, t, fn) \ + always_inline t t##_pack (f lo, f hi) \ + { \ + return (t) fn ((__m512i) lo, (__m512i) hi); \ + } + +_ (i16x32, i8x64, _mm512_packs_epi16) +_ (i16x32, u8x64, _mm512_packus_epi16) +_ (i32x16, i16x32, _mm512_packs_epi32) +_ (i32x16, u16x32, _mm512_packus_epi32) +#undef _ + static_always_inline u32x16 u32x16_byte_swap (u32x16 v) { diff --git a/src/vppinfra/vector_funcs.h b/src/vppinfra/vector_funcs.h index 2b02d9eb301..5c446a5d50d 100644 --- a/src/vppinfra/vector_funcs.h +++ b/src/vppinfra/vector_funcs.h @@ -27,37 +27,36 @@ clib_compare_u16_x64 (u16 v, u16 *a) u16x16u *av = (u16x16u *) a; i8x32 x; - x = i16x16_pack (v16 == av[0], v16 == av[1]); + x = i8x32_pack (v16 == av[0], v16 == av[1]); mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)); - x = i16x16_pack (v16 == av[2], v16 == av[3]); + x = i8x32_pack (v16 == av[2], v16 == av[3]); mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32; #elif defined(CLIB_HAVE_VEC128) && defined(__ARM_NEON) - u16x8 idx8 = u16x8_splat (v); + u16x8 v8 = u16x8_splat (v); u16x8 m = { 1, 2, 4, 8, 16, 32, 64, 128 }; u16x8u *av = (u16x8u *) a; - /* compare each u16 elemment with idx8, result gives 0xffff in each element + /* compare each u16 elemment with v8, result gives 0xffff in each element of the resulting vector if comparison result is true. Bitwise AND with m will give us one bit set for true result and offset of that bit represend element index. Finally vaddvq_u16() gives us sum of all elements of the vector which will give us u8 bitmap. */ - mask = ((u64) vaddvq_u16 ((av[0] == idx8) & m) | - (u64) vaddvq_u16 ((av[1] == idx8) & m) << 8 | - (u64) vaddvq_u16 ((av[2] == idx8) & m) << 16 | - (u64) vaddvq_u16 ((av[3] == idx8) & m) << 24 | - (u64) vaddvq_u16 ((av[4] == idx8) & m) << 32 | - (u64) vaddvq_u16 ((av[5] == idx8) & m) << 40 | - (u64) vaddvq_u16 ((av[6] == idx8) & m) << 48 | - (u64) vaddvq_u16 ((av[7] == idx8) & m) << 56); + mask = ((u64) vaddvq_u16 ((av[0] == v8) & m) | + (u64) vaddvq_u16 ((av[1] == v8) & m) << 8 | + (u64) vaddvq_u16 ((av[2] == v8) & m) << 16 | + (u64) vaddvq_u16 ((av[3] == v8) & m) << 24 | + (u64) vaddvq_u16 ((av[4] == v8) & m) << 32 | + (u64) vaddvq_u16 ((av[5] == v8) & m) << 40 | + (u64) vaddvq_u16 ((av[6] == v8) & m) << 48 | + (u64) vaddvq_u16 ((av[7] == v8) & m) << 56); #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) - u16x8 idx8 = u16x8_splat (v); + u16x8 v8 = u16x8_splat (v); u16x8u *av = (u16x8u *) a; - mask = - ((u64) i8x16_msb_mask (i16x8_pack (idx8 == av[0], idx8 == av[1])) | - (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[2], idx8 == av[3])) << 16 | - (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[4], idx8 == av[5])) << 32 | - (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[6], idx8 == av[7])) << 48); + mask = ((u64) i8x16_msb_mask (i8x16_pack (v8 == av[0], v8 == av[1])) | + (u64) i8x16_msb_mask (i8x16_pack (v8 == av[2], v8 == av[3])) << 16 | + (u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 | + (u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48); #else for (int i = 0; i < 64; i++) if (a[i] == v) diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h index f86fad39b02..1bdb34b866e 100644 --- a/src/vppinfra/vector_sse42.h +++ b/src/vppinfra/vector_sse42.h @@ -184,74 +184,18 @@ u32x2_interleave_lo (u32x2 a, u32x2 b) } /* 128 bit packs. */ -always_inline u8x16 -u16x8_pack (u16x8 lo, u16x8 hi) -{ - return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi); -} - -always_inline i8x16 -i16x8_pack (i16x8 lo, i16x8 hi) -{ - return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi); -} - -always_inline u16x8 -u32x4_pack (u32x4 lo, u32x4 hi) -{ - return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi); -} - -/* 64 bit packs. */ -always_inline u8x8 -u16x4_pack (u16x4 lo, u16x4 hi) -{ - return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi); -} - -always_inline i8x8 -i16x4_pack (i16x4 lo, i16x4 hi) -{ - return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi); -} - -always_inline u16x4 -u32x2_pack (u32x2 lo, u32x2 hi) -{ - return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi); -} - -always_inline i16x4 -i32x2_pack (i32x2 lo, i32x2 hi) -{ - return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi); -} - -#ifndef __ICC -always_inline u64x2 -u64x2_read_lo (u64x2 x, u64 * a) -{ - return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a); -} - -always_inline u64x2 -u64x2_read_hi (u64x2 x, u64 * a) -{ - return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a); -} +#define _(f, t, fn) \ + always_inline t t##_pack (f lo, f hi) \ + { \ + return (t) fn ((__m128i) lo, (__m128i) hi); \ + } -always_inline void -u64x2_write_lo (u64x2 x, u64 * a) -{ - _mm_storel_pi ((__m64 *) a, (__m128) x); -} +_ (i16x8, i8x16, _mm_packs_epi16) +_ (i16x8, u8x16, _mm_packus_epi16) +_ (i32x4, i16x8, _mm_packs_epi32) +_ (i32x4, u16x8, _mm_packus_epi32) -always_inline void -u64x2_write_hi (u64x2 x, u64 * a) -{ - _mm_storeh_pi ((__m64 *) a, (__m128) x); -} -#endif +#undef _ #define _signed_binop(n,m,f,g) \ /* Unsigned */ \ -- cgit 1.2.3-korg