diff options
Diffstat (limited to 'src/vppinfra/vector/array_mask.h')
-rw-r--r-- | src/vppinfra/vector/array_mask.h | 119 |
1 file changed, 86 insertions, 33 deletions
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h index 778ed3e638f..3d4a82ac01b 100644 --- a/src/vppinfra/vector/array_mask.h +++ b/src/vppinfra/vector/array_mask.h @@ -17,61 +17,114 @@ static_always_inline void clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts) { - u32 i; #if defined(CLIB_HAVE_VEC512) u32x16 mask16 = u32x16_splat (mask); - - for (i = 0; i + 16 <= n_elts; i += 16) - *((u32x16u *) (src + i)) &= mask16; - n_elts -= i; - if (n_elts) + if (n_elts <= 16) { - u16 m = pow2_mask (n_elts); - u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i, - m); + u32 m = pow2_mask (n_elts); + u32x16 r = u32x16_mask_load_zero (src, m); + u32x16_mask_store (r & mask16, src, m); + return; } - return; + for (; n_elts >= 16; n_elts -= 16, src += 16) + *((u32x16u *) src) &= mask16; + *((u32x16u *) (src + n_elts - 16)) &= mask16; #elif defined(CLIB_HAVE_VEC256) u32x8 mask8 = u32x8_splat (mask); - - for (i = 0; i + 8 <= n_elts; i += 8) - *((u32x8u *) (src + i)) &= mask8; - n_elts -= i; - src += i; #if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) - if (n_elts) + if (n_elts <= 8) { - u8 m = pow2_mask (n_elts); - u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m); + u32 m = pow2_mask (n_elts); + u32x8 r = u32x8_mask_load_zero (src, m); + u32x8_mask_store (r & mask8, src, m); + return; + } +#else + if (PREDICT_FALSE (n_elts < 4)) + { + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; + } + if (n_elts <= 8) + { + u32x4 mask4 = u32x4_splat (mask); + *(u32x4u *) src &= mask4; + *(u32x4u *) (src + n_elts - 4) &= mask4; + return; } - return; #endif + + for (; n_elts >= 8; n_elts -= 8, src += 8) + *((u32x8u *) src) &= mask8; + *((u32x8u *) (src + n_elts - 8)) &= mask8; #elif defined(CLIB_HAVE_VEC128) u32x4 mask4 = u32x4_splat (mask); - for (i = 0; i + 4 <= n_elts; i += 4) - *((u32x4u *) (src + i)) &= mask4; - n_elts -= i; - src += i; - switch (n_elts) + 
if (PREDICT_FALSE (n_elts < 4)) { - case 3: - src[2] &= mask; - case 2: - src[1] &= mask; - case 1: - src[0] &= mask; - case 0: - default:; + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; } + + for (; n_elts >= 4; n_elts -= 4, src += 4) + *((u32x4u *) src) &= mask4; + *((u32x4u *) (src + n_elts - 4)) &= mask4; return; -#endif +#else while (n_elts > 0) { src[0] &= mask; src++; n_elts--; } +#endif +} + +static_always_inline void +clib_array_mask_set_u32_x64 (u32 *a, u32 v, uword bmp, int n_elts) +{ +#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u32x16 r = u32x16_splat (v); + for (; n_elts > 0; n_elts -= 16, a += 16, bmp >>= 16) + u32x16_mask_store (r, a, bmp); +#elif defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8 r = u32x8_splat (v); + for (; n_elts > 0; n_elts -= 8, a += 8, bmp >>= 8) + u32x8_mask_store (r, a, bmp); +#else + while (bmp) + { + a[get_lowest_set_bit_index (bmp)] = v; + bmp = clear_lowest_set_bit (bmp); + } +#endif +} + +static_always_inline void +clib_array_mask_set_u32 (u32 *a, u32 v, uword *bmp, u32 n_elts) +{ + while (n_elts >= uword_bits) + { + clib_array_mask_set_u32_x64 (a, v, bmp++[0], uword_bits); + a += uword_bits; + n_elts -= uword_bits; + } + + clib_array_mask_set_u32_x64 (a, v, bmp[0] & pow2_mask (n_elts), n_elts); } #endif |