diff options
-rw-r--r-- | src/vppinfra/vector/array_mask.h | 85 |
1 files changed, 52 insertions, 33 deletions
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h index 778ed3e638f..fa427a6f1a9 100644 --- a/src/vppinfra/vector/array_mask.h +++ b/src/vppinfra/vector/array_mask.h @@ -17,61 +17,80 @@ static_always_inline void clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts) { - u32 i; #if defined(CLIB_HAVE_VEC512) u32x16 mask16 = u32x16_splat (mask); - - for (i = 0; i + 16 <= n_elts; i += 16) - *((u32x16u *) (src + i)) &= mask16; - n_elts -= i; - if (n_elts) + if (n_elts <= 16) { - u16 m = pow2_mask (n_elts); - u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i, - m); + u32 m = pow2_mask (n_elts); + u32x16 r = u32x16_mask_load_zero (src, m); + u32x16_mask_store (r & mask16, src, m); + return; } - return; + for (int i = 0; i < n_elts; i += 16) + *((u32x16u *) (src + i)) &= mask16; + *((u32x16u *) (src + n_elts - 16)) &= mask16; #elif defined(CLIB_HAVE_VEC256) u32x8 mask8 = u32x8_splat (mask); - - for (i = 0; i + 8 <= n_elts; i += 8) - *((u32x8u *) (src + i)) &= mask8; - n_elts -= i; - src += i; #if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) - if (n_elts) + if (n_elts <= 8) { - u8 m = pow2_mask (n_elts); - u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m); + u32 m = pow2_mask (n_elts); + u32x8 r = u32x8_mask_load_zero (src, m); + u32x8_mask_store (r & mask8, src, m); + return; + } +#else + if (PREDICT_FALSE (n_elts < 4)) + { + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; + } + if (n_elts <= 8) + { + u32x4 mask4 = u32x4_splat (mask); + *(u32x4u *) src &= mask4; + *(u32x4u *) (src + n_elts - 4) &= mask4; } - return; #endif + + for (int i = 0; i < n_elts; i += 8) + *((u32x8u *) (src + i)) &= mask8; + *((u32x8u *) (src + n_elts - 8)) &= mask8; #elif defined(CLIB_HAVE_VEC128) u32x4 mask4 = u32x4_splat (mask); - for (i = 0; i + 4 <= n_elts; i += 4) - *((u32x4u *) (src + i)) &= mask4; - n_elts -= i; - src += i; - switch (n_elts) + if (PREDICT_FALSE (n_elts < 4)) { - case 3: - src[2] &= mask; - case 2: - src[1] &= mask; - case 1: - src[0] &= mask; - case 0: - default:; + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; } + + for (int i = 0; i < n_elts; i += 4) + *((u32x4u *) (src + i)) &= mask4; + *((u32x4u *) (src + n_elts - 4)) &= mask4; return; -#endif +#else while (n_elts > 0) { src[0] &= mask; src++; n_elts--; } +#endif } #endif |