summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/vppinfra/vector/array_mask.h85
1 files changed, 52 insertions, 33 deletions
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h
index 778ed3e638f..fa427a6f1a9 100644
--- a/src/vppinfra/vector/array_mask.h
+++ b/src/vppinfra/vector/array_mask.h
@@ -17,61 +17,80 @@
static_always_inline void
clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts)
{
- u32 i;
#if defined(CLIB_HAVE_VEC512)
u32x16 mask16 = u32x16_splat (mask);
-
- for (i = 0; i + 16 <= n_elts; i += 16)
- *((u32x16u *) (src + i)) &= mask16;
- n_elts -= i;
- if (n_elts)
+ if (n_elts <= 16)
{
- u16 m = pow2_mask (n_elts);
- u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i,
- m);
+ u32 m = pow2_mask (n_elts);
+ u32x16 r = u32x16_mask_load_zero (src, m);
+ u32x16_mask_store (r & mask16, src, m);
+ return;
}
- return;
+ for (int i = 0; i < n_elts; i += 16)
+ *((u32x16u *) (src + i)) &= mask16;
+ *((u32x16u *) (src + n_elts - 16)) &= mask16;
#elif defined(CLIB_HAVE_VEC256)
u32x8 mask8 = u32x8_splat (mask);
-
- for (i = 0; i + 8 <= n_elts; i += 8)
- *((u32x8u *) (src + i)) &= mask8;
- n_elts -= i;
- src += i;
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
- if (n_elts)
+ if (n_elts <= 8)
{
- u8 m = pow2_mask (n_elts);
- u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m);
+ u32 m = pow2_mask (n_elts);
+ u32x8 r = u32x8_mask_load_zero (src, m);
+ u32x8_mask_store (r & mask8, src, m);
+ return;
+ }
+#else
+ if (PREDICT_FALSE (n_elts < 4))
+ {
+ if (n_elts & 2)
+ {
+ src[0] &= mask;
+ src[1] &= mask;
+ src += 2;
+ }
+ if (n_elts & 1)
+ src[0] &= mask;
+ return;
+ }
+ if (n_elts <= 8)
+ {
+ u32x4 mask4 = u32x4_splat (mask);
+ *(u32x4u *) src &= mask4;
+ *(u32x4u *) (src + n_elts - 4) &= mask4;
}
- return;
#endif
+
+ for (int i = 0; i < n_elts; i += 8)
+ *((u32x8u *) (src + i)) &= mask8;
+ *((u32x8u *) (src + n_elts - 8)) &= mask8;
#elif defined(CLIB_HAVE_VEC128)
u32x4 mask4 = u32x4_splat (mask);
- for (i = 0; i + 4 <= n_elts; i += 4)
- *((u32x4u *) (src + i)) &= mask4;
- n_elts -= i;
- src += i;
- switch (n_elts)
+ if (PREDICT_FALSE (n_elts < 4))
{
- case 3:
- src[2] &= mask;
- case 2:
- src[1] &= mask;
- case 1:
- src[0] &= mask;
- case 0:
- default:;
+ if (n_elts & 2)
+ {
+ src[0] &= mask;
+ src[1] &= mask;
+ src += 2;
+ }
+ if (n_elts & 1)
+ src[0] &= mask;
+ return;
}
+
+ for (int i = 0; i < n_elts; i += 4)
+ *((u32x4u *) (src + i)) &= mask4;
+ *((u32x4u *) (src + n_elts - 4)) &= mask4;
return;
-#endif
+#else
while (n_elts > 0)
{
src[0] &= mask;
src++;
n_elts--;
}
+#endif
}
#endif