Diffstat (limited to 'src/vppinfra/vector')
-rw-r--r--  src/vppinfra/vector/array_mask.h         | 119
-rw-r--r--  src/vppinfra/vector/compress.h           | 140
-rw-r--r--  src/vppinfra/vector/count_equal.h        | 306
-rw-r--r--  src/vppinfra/vector/index_to_ptr.h       | 257
-rw-r--r--  src/vppinfra/vector/ip_csum.h            | 339
-rw-r--r--  src/vppinfra/vector/mask_compare.h       | 207
-rw-r--r--  src/vppinfra/vector/test/array_mask.c    | 124
-rw-r--r--  src/vppinfra/vector/test/compress.c      | 248
-rw-r--r--  src/vppinfra/vector/test/mask_compare.c  |  95
-rw-r--r--  src/vppinfra/vector/test/test.c          |  53
-rw-r--r--  src/vppinfra/vector/test/test.h          |  35
-rw-r--r--  src/vppinfra/vector/toeplitz.c           | 122
-rw-r--r--  src/vppinfra/vector/toeplitz.h           | 513
13 files changed, 1942 insertions, 616 deletions
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h
index 778ed3e638f..3d4a82ac01b 100644
--- a/src/vppinfra/vector/array_mask.h
+++ b/src/vppinfra/vector/array_mask.h
@@ -17,61 +17,114 @@
static_always_inline void
clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts)
{
- u32 i;
#if defined(CLIB_HAVE_VEC512)
u32x16 mask16 = u32x16_splat (mask);
-
- for (i = 0; i + 16 <= n_elts; i += 16)
- *((u32x16u *) (src + i)) &= mask16;
- n_elts -= i;
- if (n_elts)
+ if (n_elts <= 16)
{
- u16 m = pow2_mask (n_elts);
- u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i,
- m);
+ u32 m = pow2_mask (n_elts);
+ u32x16 r = u32x16_mask_load_zero (src, m);
+ u32x16_mask_store (r & mask16, src, m);
+ return;
}
- return;
+ for (; n_elts >= 16; n_elts -= 16, src += 16)
+ *((u32x16u *) src) &= mask16;
+ *((u32x16u *) (src + n_elts - 16)) &= mask16;
#elif defined(CLIB_HAVE_VEC256)
u32x8 mask8 = u32x8_splat (mask);
-
- for (i = 0; i + 8 <= n_elts; i += 8)
- *((u32x8u *) (src + i)) &= mask8;
- n_elts -= i;
- src += i;
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
- if (n_elts)
+ if (n_elts <= 8)
{
- u8 m = pow2_mask (n_elts);
- u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m);
+ u32 m = pow2_mask (n_elts);
+ u32x8 r = u32x8_mask_load_zero (src, m);
+ u32x8_mask_store (r & mask8, src, m);
+ return;
+ }
+#else
+ if (PREDICT_FALSE (n_elts < 4))
+ {
+ if (n_elts & 2)
+ {
+ src[0] &= mask;
+ src[1] &= mask;
+ src += 2;
+ }
+ if (n_elts & 1)
+ src[0] &= mask;
+ return;
+ }
+ if (n_elts <= 8)
+ {
+ u32x4 mask4 = u32x4_splat (mask);
+ *(u32x4u *) src &= mask4;
+ *(u32x4u *) (src + n_elts - 4) &= mask4;
+ return;
}
- return;
#endif
+
+ for (; n_elts >= 8; n_elts -= 8, src += 8)
+ *((u32x8u *) src) &= mask8;
+ *((u32x8u *) (src + n_elts - 8)) &= mask8;
#elif defined(CLIB_HAVE_VEC128)
u32x4 mask4 = u32x4_splat (mask);
- for (i = 0; i + 4 <= n_elts; i += 4)
- *((u32x4u *) (src + i)) &= mask4;
- n_elts -= i;
- src += i;
- switch (n_elts)
+ if (PREDICT_FALSE (n_elts < 4))
{
- case 3:
- src[2] &= mask;
- case 2:
- src[1] &= mask;
- case 1:
- src[0] &= mask;
- case 0:
- default:;
+ if (n_elts & 2)
+ {
+ src[0] &= mask;
+ src[1] &= mask;
+ src += 2;
+ }
+ if (n_elts & 1)
+ src[0] &= mask;
+ return;
}
+
+ for (; n_elts >= 4; n_elts -= 4, src += 4)
+ *((u32x4u *) src) &= mask4;
+ *((u32x4u *) (src + n_elts - 4)) &= mask4;
return;
-#endif
+#else
while (n_elts > 0)
{
src[0] &= mask;
src++;
n_elts--;
}
+#endif
+}
+
+static_always_inline void
+clib_array_mask_set_u32_x64 (u32 *a, u32 v, uword bmp, int n_elts)
+{
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ u32x16 r = u32x16_splat (v);
+ for (; n_elts > 0; n_elts -= 16, a += 16, bmp >>= 16)
+ u32x16_mask_store (r, a, bmp);
+#elif defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u32x8 r = u32x8_splat (v);
+ for (; n_elts > 0; n_elts -= 8, a += 8, bmp >>= 8)
+ u32x8_mask_store (r, a, bmp);
+#else
+ while (bmp)
+ {
+ a[get_lowest_set_bit_index (bmp)] = v;
+ bmp = clear_lowest_set_bit (bmp);
+ }
+#endif
+}
+
+static_always_inline void
+clib_array_mask_set_u32 (u32 *a, u32 v, uword *bmp, u32 n_elts)
+{
+ while (n_elts >= uword_bits)
+ {
+ clib_array_mask_set_u32_x64 (a, v, bmp++[0], uword_bits);
+ a += uword_bits;
+ n_elts -= uword_bits;
+ }
+
+ clib_array_mask_set_u32_x64 (a, v, bmp[0] & pow2_mask (n_elts), n_elts);
}
#endif
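
A minimal usage sketch for the two helpers above (editorial illustration, not part of
this patch). Only the clib_array_mask_u32 / clib_array_mask_set_u32 signatures come
from the header; the array contents, mask value and the example_* name are invented.

#include <vppinfra/vector/array_mask.h>

static void
example_array_mask (void)
{
  /* wrap 10 ring indices onto a 16-slot ring: every element is ANDed with 0xf */
  u32 slots[10] = { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
  clib_array_mask_u32 (slots, 0xf, 10);
  /* slots is now { 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 } */

  /* overwrite the elements selected by a bitmap (bits 0, 3 and 5) with 42 */
  uword bmp = (1 << 0) | (1 << 3) | (1 << 5);
  clib_array_mask_set_u32 (slots, 42, &bmp, 10);
}
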
diff --git a/src/vppinfra/vector/compress.h b/src/vppinfra/vector/compress.h
index adb6503f711..5429113984b 100644
--- a/src/vppinfra/vector/compress.h
+++ b/src/vppinfra/vector/compress.h
@@ -27,12 +27,40 @@ clib_compress_u64_x64 (u64 *dst, u64 *src, u64 mask)
mask >>= 4;
}
#else
- while (mask)
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
+#endif
+ return dst;
+}
+
+static_always_inline u64 *
+clib_compress_u64_x64_masked (u64 *dst, u64 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS) && \
+ defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ u64x8u *sv = (u64x8u *) src;
+ for (int i = 0; i < 8; i++)
{
- u16 bit = count_trailing_zeros (mask);
- mask = clear_lowest_set_bit (mask);
- dst++[0] = src[bit];
+ u64x8u s = u64x8_mask_load_zero (&sv[i], mask);
+ u64x8_compress_store (s, mask, dst);
+ dst += _popcnt32 ((u8) mask);
+ mask >>= 8;
}
+#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \
+ defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u64x4u *sv = (u64x4u *) src;
+ for (int i = 0; i < 16; i++)
+ {
+ u64x4u s = u64x4_mask_load_zero (&sv[i], mask);
+ u64x4_compress_store (s, mask, dst);
+ dst += _popcnt32 (((u8) mask) & 0x0f);
+ mask >>= 4;
+ }
+#else
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
#endif
return dst;
}
@@ -69,7 +97,9 @@ clib_compress_u64 (u64 *dst, u64 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
- return clib_compress_u64_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+ return clib_compress_u64_x64_masked (dst, src,
+ mask[0] & pow2_mask (n_elts)) -
+ dst0;
}
static_always_inline u32 *
@@ -93,12 +123,41 @@ clib_compress_u32_x64 (u32 *dst, u32 *src, u64 mask)
mask >>= 8;
}
#else
- while (mask)
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
+#endif
+ return dst;
+}
+
+static_always_inline u32 *
+clib_compress_u32_x64_masked (u32 *dst, u32 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS) && \
+ defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ u32x16u *sv = (u32x16u *) src;
+ for (int i = 0; i < 4; i++)
{
- u16 bit = count_trailing_zeros (mask);
- mask = clear_lowest_set_bit (mask);
- dst++[0] = src[bit];
+ u32x16u s = u32x16_mask_load_zero (&sv[i], mask);
+ u32x16_compress_store (s, mask, dst);
+ dst += _popcnt32 ((u16) mask);
+ mask >>= 16;
}
+
+#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \
+ defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u32x8u *sv = (u32x8u *) src;
+ for (int i = 0; i < 8; i++)
+ {
+ u32x8u s = u32x8_mask_load_zero (&sv[i], mask);
+ u32x8_compress_store (s, mask, dst);
+ dst += _popcnt32 ((u8) mask);
+ mask >>= 8;
+ }
+#else
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
#endif
return dst;
}
@@ -135,7 +194,9 @@ clib_compress_u32 (u32 *dst, u32 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
- return clib_compress_u32_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+ return clib_compress_u32_x64_masked (dst, src,
+ mask[0] & pow2_mask (n_elts)) -
+ dst0;
}
static_always_inline u16 *
@@ -150,12 +211,30 @@ clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask)
mask >>= 32;
}
#else
- while (mask)
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
+#endif
+ return dst;
+}
+
+static_always_inline u16 *
+clib_compress_u16_x64_masked (u16 *dst, u16 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \
+ defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ u16x32u *sv = (u16x32u *) src;
+ for (int i = 0; i < 2; i++)
{
- u16 bit = count_trailing_zeros (mask);
- mask = clear_lowest_set_bit (mask);
- dst++[0] = src[bit];
+ u16x32u s = u16x32_mask_load_zero (&sv[i], mask);
+ u16x32_compress_store (s, mask, dst);
+ dst += _popcnt32 ((u32) mask);
+ mask >>= 32;
}
+#else
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
#endif
return dst;
}
@@ -192,7 +271,9 @@ clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
- return clib_compress_u16_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+ return clib_compress_u16_x64_masked (dst, src,
+ mask[0] & pow2_mask (n_elts)) -
+ dst0;
}
static_always_inline u8 *
@@ -203,12 +284,26 @@ clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask)
u8x64_compress_store (sv[0], mask, dst);
dst += _popcnt64 (mask);
#else
- while (mask)
- {
- u16 bit = count_trailing_zeros (mask);
- mask = clear_lowest_set_bit (mask);
- dst++[0] = src[bit];
- }
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
+#endif
+ return dst;
+}
+
+static_always_inline u8 *
+clib_compress_u8_x64_masked (u8 *dst, u8 *src, u64 mask)
+{
+#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \
+ defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ u8x64u *sv = (u8x64u *) src;
+ u8x64u s = u8x64_mask_load_zero (sv, mask);
+ u8x64_compress_store (s, mask, dst);
+ dst += _popcnt64 (mask);
+#else
+ u32 i;
+ foreach_set_bit_index (i, mask)
+ dst++[0] = src[i];
#endif
return dst;
}
@@ -245,7 +340,8 @@ clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return dst - dst0;
- return clib_compress_u8_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0;
+ return clib_compress_u8_x64_masked (dst, src, mask[0] & pow2_mask (n_elts)) -
+ dst0;
}
#endif
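
A minimal usage sketch for clib_compress_u32 (editorial illustration, not part of this
patch); the other element widths follow the same pattern. The source values and the
0xa5 bitmap are invented; the signature and the returned element count come from the
header and its former unit test.

#include <vppinfra/vector/compress.h>

static u32
example_compress (void)
{
  u32 src[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  u32 dst[8];
  u64 mask[1] = { 0xa5 }; /* bits 0, 2, 5, 7 set */

  /* copies src[0], src[2], src[5], src[7] into dst[0..3] and returns 4 */
  return clib_compress_u32 (dst, src, mask, 8);
}
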
diff --git a/src/vppinfra/vector/count_equal.h b/src/vppinfra/vector/count_equal.h
new file mode 100644
index 00000000000..ca2fbb7fd39
--- /dev/null
+++ b/src/vppinfra/vector/count_equal.h
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_count_equal_h
+#define included_vector_count_equal_h
+#include <vppinfra/clib.h>
+
+static_always_inline uword
+clib_count_equal_u64 (u64 *data, uword max_count)
+{
+ uword count;
+ u64 first;
+
+ if (max_count <= 1)
+ return max_count;
+ if (data[0] != data[1])
+ return 1;
+
+ count = 0;
+ first = data[0];
+
+#if defined(CLIB_HAVE_VEC256)
+ u64x4 splat = u64x4_splat (first);
+ while (count + 3 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat));
+ if (bmp != 0xffffffff)
+ {
+ count += count_trailing_zeros (~bmp) / 8;
+ return count;
+ }
+
+ data += 4;
+ count += 4;
+ }
+#else
+ count += 2;
+ data += 2;
+ while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) |
+ (data[2] ^ first) | (data[3] ^ first)) == 0)
+ {
+ data += 4;
+ count += 4;
+ }
+#endif
+ while (count < max_count && (data[0] == first))
+ {
+ data += 1;
+ count += 1;
+ }
+ return count;
+}
+
+static_always_inline uword
+clib_count_equal_u32 (u32 *data, uword max_count)
+{
+ uword count;
+ u32 first;
+
+ if (max_count <= 1)
+ return max_count;
+ if (data[0] != data[1])
+ return 1;
+
+ count = 0;
+ first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+ u32x16 splat = u32x16_splat (first);
+ while (count + 15 < max_count)
+ {
+ u32 bmp;
+ bmp = u32x16_is_equal_mask (u32x16_load_unaligned (data), splat);
+ if (bmp != pow2_mask (16))
+ return count + count_trailing_zeros (~bmp);
+
+ data += 16;
+ count += 16;
+ }
+ if (count == max_count)
+ return count;
+ else
+ {
+ u32 mask = pow2_mask (max_count - count);
+ u32 bmp =
+ u32x16_is_equal_mask (u32x16_mask_load_zero (data, mask), splat) &
+ mask;
+ return count + count_trailing_zeros (~bmp);
+ }
+#elif defined(CLIB_HAVE_VEC256)
+ u32x8 splat = u32x8_splat (first);
+ while (count + 7 < max_count)
+ {
+ u32 bmp;
+#ifdef __AVX512F__
+ bmp = u32x8_is_equal_mask (u32x8_load_unaligned (data), splat);
+ if (bmp != pow2_mask (8))
+ return count + count_trailing_zeros (~bmp);
+#else
+ bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat));
+ if (bmp != 0xffffffff)
+ return count + count_trailing_zeros (~bmp) / 4;
+#endif
+
+ data += 8;
+ count += 8;
+ }
+ if (count == max_count)
+ return count;
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ else
+ {
+ u32 mask = pow2_mask (max_count - count);
+ u32 bmp =
+ u32x8_is_equal_mask (u32x8_mask_load_zero (data, mask), splat) & mask;
+ return count + count_trailing_zeros (~bmp);
+ }
+#endif
+#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
+ u32x4 splat = u32x4_splat (first);
+ while (count + 3 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat));
+ if (bmp != pow2_mask (4 * 4))
+ {
+ count += count_trailing_zeros (~bmp) / 4;
+ return count;
+ }
+
+ data += 4;
+ count += 4;
+ }
+#else
+ count += 2;
+ data += 2;
+ while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) |
+ (data[2] ^ first) | (data[3] ^ first)) == 0)
+ {
+ data += 4;
+ count += 4;
+ }
+#endif
+ while (count < max_count && (data[0] == first))
+ {
+ data += 1;
+ count += 1;
+ }
+ return count;
+}
+
+static_always_inline uword
+clib_count_equal_u16 (u16 *data, uword max_count)
+{
+ uword count;
+ u16 first;
+
+ if (max_count <= 1)
+ return max_count;
+ if (data[0] != data[1])
+ return 1;
+
+ count = 0;
+ first = data[0];
+
+#if defined(CLIB_HAVE_VEC256)
+ u16x16 splat = u16x16_splat (first);
+ while (count + 15 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat));
+ if (bmp != 0xffffffff)
+ {
+ count += count_trailing_zeros (~bmp) / 2;
+ return count;
+ }
+
+ data += 16;
+ count += 16;
+ }
+#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
+ u16x8 splat = u16x8_splat (first);
+ while (count + 7 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat));
+ if (bmp != 0xffff)
+ {
+ count += count_trailing_zeros (~bmp) / 2;
+ return count;
+ }
+
+ data += 8;
+ count += 8;
+ }
+#else
+ count += 2;
+ data += 2;
+ while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) |
+ (data[2] ^ first) | (data[3] ^ first)) == 0)
+ {
+ data += 4;
+ count += 4;
+ }
+#endif
+ while (count < max_count && (data[0] == first))
+ {
+ data += 1;
+ count += 1;
+ }
+ return count;
+}
+
+static_always_inline uword
+clib_count_equal_u8 (u8 *data, uword max_count)
+{
+ uword count;
+ u8 first;
+
+ if (max_count <= 1)
+ return max_count;
+ if (data[0] != data[1])
+ return 1;
+
+ count = 0;
+ first = data[0];
+
+#if defined(CLIB_HAVE_VEC512)
+ u8x64 splat = u8x64_splat (first);
+ while (count + 63 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x64_is_equal_mask (u8x64_load_unaligned (data), splat);
+ if (bmp != -1)
+ return count + count_trailing_zeros (~bmp);
+
+ data += 64;
+ count += 64;
+ }
+ if (count == max_count)
+ return count;
+#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
+ else
+ {
+ u64 mask = pow2_mask (max_count - count);
+ u64 bmp =
+ u8x64_is_equal_mask (u8x64_mask_load_zero (data, mask), splat) & mask;
+ return count + count_trailing_zeros (~bmp);
+ }
+#endif
+#elif defined(CLIB_HAVE_VEC256)
+ u8x32 splat = u8x32_splat (first);
+ while (count + 31 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat));
+ if (bmp != 0xffffffff)
+ return count + count_trailing_zeros (~bmp);
+
+ data += 32;
+ count += 32;
+ }
+ if (count == max_count)
+ return count;
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ else
+ {
+ u32 mask = pow2_mask (max_count - count);
+ u64 bmp =
+ u8x32_msb_mask (u8x32_mask_load_zero (data, mask) == splat) & mask;
+ return count + count_trailing_zeros (~bmp);
+ }
+#endif
+#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
+ u8x16 splat = u8x16_splat (first);
+ while (count + 15 < max_count)
+ {
+ u64 bmp;
+ bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat));
+ if (bmp != 0xffff)
+ return count + count_trailing_zeros (~bmp);
+
+ data += 16;
+ count += 16;
+ }
+#else
+ count += 2;
+ data += 2;
+ while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) |
+ (data[2] ^ first) | (data[3] ^ first)) == 0)
+ {
+ data += 4;
+ count += 4;
+ }
+#endif
+ while (count < max_count && (data[0] == first))
+ {
+ data += 1;
+ count += 1;
+ }
+ return count;
+}
+
+#endif
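
A minimal usage sketch for the new header (editorial illustration, not part of this
patch). The data values are invented; the semantics -- count how many leading elements
equal the first one, capped at max_count -- follow from the code above.

#include <vppinfra/vector/count_equal.h>

static uword
example_count_equal (void)
{
  u8 data[8] = { 7, 7, 7, 7, 9, 7, 7, 7 };

  /* returns 4: the run of elements equal to data[0] ends at data[4] */
  return clib_count_equal_u8 (data, 8);
}
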
diff --git a/src/vppinfra/vector/index_to_ptr.h b/src/vppinfra/vector/index_to_ptr.h
new file mode 100644
index 00000000000..3985b757d54
--- /dev/null
+++ b/src/vppinfra/vector/index_to_ptr.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_index_to_ptr_h
+#define included_vector_index_to_ptr_h
+#include <vppinfra/clib.h>
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
+{
+ u32x4 iv4 = u32x4_load_unaligned (indices + i);
+ u64x2 pv2;
+ pv2 = u64x2_from_u32x4 (iv4);
+ u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
+#ifdef __aarch64__
+ pv2 = u64x2_from_u32x4_high (iv4);
+#else
+ pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
+#endif
+ u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
+}
+#endif
+
+/** \brief Convert array of indices to pointers with base and shift
+
+ @param indices source array of u32 indices
+ @param base base pointer
+    @param shift number of bits to shift each index left by
+    @param ptrs destination array of pointers
+ @param n_elts number of elements in the source array
+*/
+
+static_always_inline void
+clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
+ u32 n_elts)
+{
+#if defined CLIB_HAVE_VEC512
+ if (n_elts >= 8)
+ {
+ u64x8 off = u64x8_splat ((u64) base);
+ u64x8 b0, b1, b2, b3, b4, b5, b6, b7;
+
+ while (n_elts >= 64)
+ {
+ b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+ b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+ b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+ b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+ b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
+ b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
+ b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
+ b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
+ u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+ u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+ u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+ u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
+ u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
+ u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
+ u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
+ ptrs += 64;
+ indices += 64;
+ n_elts -= 64;
+ }
+
+ if (n_elts == 0)
+ return;
+
+ if (n_elts >= 32)
+ {
+ b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+ b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+ b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+ b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+ u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+ u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+ u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+ ptrs += 32;
+ indices += 32;
+ n_elts -= 32;
+ }
+ if (n_elts >= 16)
+ {
+ b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+ b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+ u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+ ptrs += 16;
+ indices += 16;
+ n_elts -= 16;
+ }
+ if (n_elts >= 8)
+ {
+ b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+ u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+ ptrs += 8;
+ indices += 8;
+ n_elts -= 8;
+ }
+
+ if (n_elts == 0)
+ return;
+
+ b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
+ u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
+ }
+ else
+ {
+ u32 mask = pow2_mask (n_elts);
+ u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
+ u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
+ return;
+ }
+#elif defined CLIB_HAVE_VEC256
+ if (n_elts >= 4)
+ {
+ u64x4 off = u64x4_splat ((u64) base);
+ u64x4 b0, b1, b2, b3, b4, b5, b6, b7;
+
+ while (n_elts >= 32)
+ {
+ b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+ b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+ b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+ b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+ b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
+ b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
+ b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
+ b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
+ u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+ u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+ u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+ u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
+ u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
+ u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
+ u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
+ ptrs += 32;
+ indices += 32;
+ n_elts -= 32;
+ }
+
+ if (n_elts == 0)
+ return;
+
+ if (n_elts >= 16)
+ {
+ b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+ b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+ b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+ b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+ u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+ u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+ u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+ ptrs += 16;
+ indices += 16;
+ n_elts -= 16;
+ }
+ if (n_elts >= 8)
+ {
+ b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+ b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+ u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+ u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+ ptrs += 8;
+ indices += 8;
+ n_elts -= 8;
+ }
+ if (n_elts > 4)
+ {
+ b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+ u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+ ptrs += 4;
+ indices += 4;
+ n_elts -= 4;
+ }
+
+ b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
+ u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
+ return;
+ }
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+ else
+ {
+ u32 mask = pow2_mask (n_elts);
+ u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
+ u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
+ return;
+ }
+#endif
+#elif defined(CLIB_HAVE_VEC128)
+ if (n_elts >= 4)
+ {
+ u64x2 ov = u64x2_splat ((u64) base);
+ u32 *i = (u32 *) indices;
+ void **p = (void **) ptrs;
+ u32 n = n_elts;
+
+ while (n >= 32)
+ {
+ clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
+ indices += 32;
+ ptrs += 32;
+ n -= 32;
+ }
+
+ if (n == 0)
+ return;
+
+ if (n >= 16)
+ {
+ clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+ indices += 16;
+ ptrs += 16;
+ n -= 16;
+ }
+
+ if (n >= 8)
+ {
+ clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+ clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+ indices += 8;
+ ptrs += 8;
+ n -= 8;
+ }
+
+ if (n > 4)
+ clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+
+ clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
+ return;
+ }
+#endif
+ while (n_elts)
+ {
+ ptrs[0] = base + ((u64) indices[0] << shift);
+ ptrs += 1;
+ indices += 1;
+ n_elts -= 1;
+ }
+}
+
+#endif
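
A minimal usage sketch for clib_index_to_ptr_u32 (editorial illustration, not part of
this patch). The 2048-byte object size and the index values are invented; the
ptrs[i] = base + (indices[i] << shift) relation comes from the scalar fallback above.

#include <vppinfra/vector/index_to_ptr.h>

static void
example_index_to_ptr (void *pool_base)
{
  /* translate 4 object indices into pointers, assuming 2048-byte (1 << 11) objects */
  u32 indices[4] = { 0, 3, 7, 9 };
  void *ptrs[4];

  clib_index_to_ptr_u32 (indices, pool_base, 11, ptrs, 4);
  /* ptrs[i] == pool_base + ((u64) indices[i] << 11) */
}
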
diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h
new file mode 100644
index 00000000000..2cea9b448ea
--- /dev/null
+++ b/src/vppinfra/vector/ip_csum.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_ip_csum_h
+#define included_vector_ip_csum_h
+#include <vppinfra/clib.h>
+typedef struct
+{
+ u64 sum;
+ u8 odd;
+} clib_ip_csum_t;
+
+#if defined(CLIB_HAVE_VEC128)
+static_always_inline u64x2
+clib_ip_csum_cvt_and_add_4 (u32x4 v)
+{
+ return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
+ (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_2 (u64x2 v)
+{
+ return v[0] + v[1];
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+static_always_inline u64x4
+clib_ip_csum_cvt_and_add_8 (u32x8 v)
+{
+ return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
+ (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_4 (u64x4 v)
+{
+ return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC512)
+static_always_inline u64x8
+clib_ip_csum_cvt_and_add_16 (u32x16 v)
+{
+ return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
+ (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_8 (u64x8 v)
+{
+ return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
+}
+#endif
+
+static_always_inline void
+clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
+ int is_copy)
+{
+ if (c->odd)
+ {
+ c->odd = 0;
+      c->sum += (u16) src[0] << 8;
+      if (is_copy)
+	dst++[0] = src[0];
+      count--;
+      src++;
+ }
+
+#if defined(CLIB_HAVE_VEC512)
+ u64x8 sum8 = {};
+
+ while (count >= 512)
+ {
+ u32x16u *s = (u32x16u *) src;
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
+ count -= 512;
+ src += 512;
+ if (is_copy)
+ {
+ u32x16u *d = (u32x16u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 512;
+ }
+ }
+
+ while (count >= 64)
+ {
+ u32x16u *s = (u32x16u *) src;
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+ count -= 64;
+ src += 64;
+ if (is_copy)
+ {
+ u32x16u *d = (u32x16u *) dst;
+ d[0] = s[0];
+	  dst += 64;
+ }
+ }
+
+#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE
+ if (count)
+ {
+ u64 mask = pow2_mask (count);
+ u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
+ sum8 += clib_ip_csum_cvt_and_add_16 (v);
+ c->odd = count & 1;
+ if (is_copy)
+ u32x16_mask_store (v, dst, mask);
+ }
+ c->sum += clib_ip_csum_hadd_8 (sum8);
+ return;
+#endif
+
+ c->sum += clib_ip_csum_hadd_8 (sum8);
+#elif defined(CLIB_HAVE_VEC256)
+ u64x4 sum4 = {};
+
+ while (count >= 256)
+ {
+ u32x8u *s = (u32x8u *) src;
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
+ count -= 256;
+ src += 256;
+ if (is_copy)
+ {
+ u32x8u *d = (u32x8u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 256;
+ }
+ }
+
+ while (count >= 32)
+ {
+ u32x8u *s = (u32x8u *) src;
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+ count -= 32;
+ src += 32;
+ if (is_copy)
+ {
+ u32x8u *d = (u32x8u *) dst;
+ d[0] = s[0];
+ dst += 32;
+ }
+ }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+ if (count)
+ {
+ u32 mask = pow2_mask (count);
+ u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
+ sum4 += clib_ip_csum_cvt_and_add_8 (v);
+ c->odd = count & 1;
+ if (is_copy)
+ u32x8_mask_store (v, dst, mask);
+ }
+ c->sum += clib_ip_csum_hadd_4 (sum4);
+ return;
+#endif
+
+ c->sum += clib_ip_csum_hadd_4 (sum4);
+#elif defined(CLIB_HAVE_VEC128)
+ u64x2 sum2 = {};
+
+ while (count >= 128)
+ {
+ u32x4u *s = (u32x4u *) src;
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
+ count -= 128;
+ src += 128;
+ if (is_copy)
+ {
+ u32x4u *d = (u32x4u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 128;
+ }
+ }
+
+ while (count >= 16)
+ {
+ u32x4u *s = (u32x4u *) src;
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+ count -= 16;
+ src += 16;
+ if (is_copy)
+ {
+ u32x4u *d = (u32x4u *) dst;
+ d[0] = s[0];
+ dst += 16;
+ }
+ }
+ c->sum += clib_ip_csum_hadd_2 (sum2);
+#else
+ while (count >= 4)
+ {
+ u32 v = *((u32 *) src);
+ c->sum += v;
+ count -= 4;
+ src += 4;
+ if (is_copy)
+ {
+ *(u32 *) dst = v;
+ dst += 4;
+ }
+ }
+#endif
+ while (count >= 2)
+ {
+ u16 v = *((u16 *) src);
+ c->sum += v;
+ count -= 2;
+ src += 2;
+ if (is_copy)
+ {
+ *(u16 *) dst = v;
+ dst += 2;
+ }
+ }
+
+ if (count)
+ {
+ c->odd = 1;
+ c->sum += (u16) src[0];
+ if (is_copy)
+ dst[0] = src[0];
+ }
+}
+
+static_always_inline u16
+clib_ip_csum_fold (clib_ip_csum_t *c)
+{
+ u64 sum = c->sum;
+#if defined(__x86_64__) && defined(__BMI2__)
+ u64 tmp = sum;
+ asm volatile(
+    /* using ADC is much faster than the mov, shift, add sequence
+     * the compiler produces */
+ "shr $32, %[sum] \n\t"
+ "add %k[tmp], %k[sum] \n\t"
+ "mov $16, %k[tmp] \n\t"
+ "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+ "adc %w[tmp], %w[sum] \n\t"
+ "adc $0, %w[sum] \n\t"
+ : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
+#else
+ sum = ((u32) sum) + (sum >> 32);
+ sum = ((u16) sum) + (sum >> 16);
+ sum = ((u16) sum) + (sum >> 16);
+#endif
+ return (~((u16) sum));
+}
+
+static_always_inline void
+clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
+{
+ return clib_ip_csum_inline (c, 0, src, count, 0);
+}
+
+static_always_inline void
+clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
+{
+ return clib_ip_csum_inline (c, dst, src, count, 1);
+}
+
+static_always_inline u16
+clib_ip_csum (u8 *src, u16 count)
+{
+ clib_ip_csum_t c = {};
+ if (COMPILE_TIME_CONST (count) && count == 12)
+ {
+ for (int i = 0; i < 3; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else if (COMPILE_TIME_CONST (count) && count == 20)
+ {
+ for (int i = 0; i < 5; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else if (COMPILE_TIME_CONST (count) && count == 40)
+ {
+ for (int i = 0; i < 10; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else
+ clib_ip_csum_inline (&c, 0, src, count, 0);
+ return clib_ip_csum_fold (&c);
+}
+
+static_always_inline u16
+clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
+{
+ clib_ip_csum_t c = {};
+ clib_ip_csum_inline (&c, dst, src, count, 1);
+ return clib_ip_csum_fold (&c);
+}
+
+#endif
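
A minimal usage sketch for the checksum helpers (editorial illustration, not part of
this patch). The 20-byte header length and the example_* names are assumptions; the
one-shot and chunked call sequences mirror the API defined above.

#include <vppinfra/vector/ip_csum.h>

static u16
example_ip4_header_csum (u8 *ip4_hdr)
{
  /* checksum field inside ip4_hdr is expected to be zero before the call */
  return clib_ip_csum (ip4_hdr, 20);
}

static u16
example_csum_in_chunks (u8 *hdr, u8 *payload, u16 payload_len)
{
  clib_ip_csum_t c = {};

  /* accumulate over non-contiguous pieces, then fold once at the end */
  clib_ip_csum_chunk (&c, hdr, 20);
  clib_ip_csum_chunk (&c, payload, payload_len);
  return clib_ip_csum_fold (&c);
}
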
diff --git a/src/vppinfra/vector/mask_compare.h b/src/vppinfra/vector/mask_compare.h
index cac48a31f47..fc72d7dac35 100644
--- a/src/vppinfra/vector/mask_compare.h
+++ b/src/vppinfra/vector/mask_compare.h
@@ -8,7 +8,7 @@
#include <vppinfra/memcpy.h>
static_always_inline u64
-clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts)
+clib_mask_compare_u16_x64 (u16 v, u16 *a)
{
u64 mask = 0;
#if defined(CLIB_HAVE_VEC512)
@@ -47,6 +47,38 @@ clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts)
(u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 |
(u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48);
#else
+ for (int i = 0; i < 64; i++)
+ if (a[i] == v)
+ mask |= 1ULL << i;
+#endif
+ return mask;
+}
+
+static_always_inline u64
+clib_mask_compare_u16_x64_n (u16 v, u16 *a, u32 n_elts)
+{
+ u64 mask = 0;
+ CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
+#if defined(CLIB_HAVE_VEC512)
+ u16x32 v32 = u16x32_splat (v);
+ u16x32u *av = (u16x32u *) a;
+ mask = ((u64) u16x32_is_equal_mask (
+ u16x32_mask_load_zero (&av[0], data_mask), v32) |
+ (u64) u16x32_is_equal_mask (
+ u16x32_mask_load_zero (&av[1], data_mask >> 32), v32)
+ << 32);
+#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u16x16 v16 = u16x16_splat (v);
+ u16x16u *av = (u16x16u *) a;
+ i8x32 x;
+
+ x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[0], data_mask),
+ v16 == u16x16_mask_load_zero (&av[1], data_mask >> 16));
+ mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3));
+ x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[2], data_mask >> 32),
+ v16 == u16x16_mask_load_zero (&av[3], data_mask >> 48));
+ mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32;
+#else
for (int i = 0; i < n_elts; i++)
if (a[i] == v)
mask |= 1ULL << i;
@@ -68,7 +100,7 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts)
{
while (n_elts >= 64)
{
- mask++[0] = clib_mask_compare_u16_x64 (v, a, 64);
+ mask++[0] = clib_mask_compare_u16_x64 (v, a);
n_elts -= 64;
a += 64;
}
@@ -76,11 +108,11 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return;
- mask[0] = clib_mask_compare_u16_x64 (v, a, n_elts) & pow2_mask (n_elts);
+ mask[0] = clib_mask_compare_u16_x64_n (v, a, n_elts) & pow2_mask (n_elts);
}
static_always_inline u64
-clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts)
+clib_mask_compare_u32_x64 (u32 v, u32 *a)
{
u64 mask = 0;
#if defined(CLIB_HAVE_VEC512)
@@ -131,6 +163,57 @@ clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts)
}
#else
+ for (int i = 0; i < 64; i++)
+ if (a[i] == v)
+ mask |= 1ULL << i;
+#endif
+ return mask;
+}
+
+static_always_inline u64
+clib_mask_compare_u32_x64_n (u32 v, u32 *a, u32 n_elts)
+{
+ u64 mask = 0;
+ CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
+#if defined(CLIB_HAVE_VEC512)
+ u32x16 v16 = u32x16_splat (v);
+ u32x16u *av = (u32x16u *) a;
+ mask = ((u64) u32x16_is_equal_mask (
+ u32x16_mask_load_zero (&av[0], data_mask), v16) |
+ (u64) u32x16_is_equal_mask (
+ u32x16_mask_load_zero (&av[1], data_mask >> 16), v16)
+ << 16 |
+ (u64) u32x16_is_equal_mask (
+ u32x16_mask_load_zero (&av[2], data_mask >> 32), v16)
+ << 32 |
+ (u64) u32x16_is_equal_mask (
+ u32x16_mask_load_zero (&av[3], data_mask >> 48), v16)
+ << 48);
+#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u32x8 v8 = u32x8_splat (v);
+ u32x8u *av = (u32x8u *) a;
+ u32x8 m = { 0, 4, 1, 5, 2, 6, 3, 7 };
+ i8x32 c;
+
+ c = i8x32_pack (
+ i16x16_pack (
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[0], data_mask)),
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[1], data_mask >> 8))),
+ i16x16_pack (
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[2], data_mask >> 16)),
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[3], data_mask >> 24))));
+ mask = i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m));
+
+ c = i8x32_pack (
+ i16x16_pack (
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[4], data_mask >> 32)),
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[5], data_mask >> 40))),
+ i16x16_pack (
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[6], data_mask >> 48)),
+ (i32x8) (v8 == u32x8_mask_load_zero (&av[7], data_mask >> 56))));
+ mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32;
+#else
for (int i = 0; i < n_elts; i++)
if (a[i] == v)
mask |= 1ULL << i;
@@ -152,7 +235,119 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts)
{
while (n_elts >= 64)
{
- bitmap++[0] = clib_mask_compare_u32_x64 (v, a, 64);
+ bitmap++[0] = clib_mask_compare_u32_x64 (v, a);
+ n_elts -= 64;
+ a += 64;
+ }
+
+ if (PREDICT_TRUE (n_elts == 0))
+ return;
+
+ bitmap[0] = clib_mask_compare_u32_x64_n (v, a, n_elts) & pow2_mask (n_elts);
+}
+
+static_always_inline u64
+clib_mask_compare_u64_x64 (u64 v, u64 *a)
+{
+ u64 mask = 0;
+#if defined(CLIB_HAVE_VEC512)
+ u64x8 v8 = u64x8_splat (v);
+ u64x8u *av = (u64x8u *) a;
+ mask = ((u64) u64x8_is_equal_mask (av[0], v8) |
+ (u64) u64x8_is_equal_mask (av[1], v8) << 8 |
+ (u64) u64x8_is_equal_mask (av[2], v8) << 16 |
+ (u64) u64x8_is_equal_mask (av[3], v8) << 24 |
+ (u64) u64x8_is_equal_mask (av[4], v8) << 32 |
+ (u64) u64x8_is_equal_mask (av[5], v8) << 40 |
+ (u64) u64x8_is_equal_mask (av[6], v8) << 48 |
+ (u64) u64x8_is_equal_mask (av[7], v8) << 56);
+
+#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__)
+ u64x4 v4 = u64x4_splat (v);
+ u64x4u *av = (u64x4u *) a;
+
+ for (int i = 0; i < 16; i += 2)
+ {
+ u64 l = u8x32_msb_mask (v4 == av[i]);
+ u64 h = u8x32_msb_mask (v4 == av[i + 1]);
+ mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4);
+ }
+#else
+ for (int i = 0; i < 64; i++)
+ if (a[i] == v)
+ mask |= 1ULL << i;
+#endif
+ return mask;
+}
+
+static_always_inline u64
+clib_mask_compare_u64_x64_n (u64 v, u64 *a, u32 n_elts)
+{
+ u64 mask = 0;
+ CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts);
+#if defined(CLIB_HAVE_VEC512)
+ u64x8 v8 = u64x8_splat (v);
+ u64x8u *av = (u64x8u *) a;
+ mask =
+ ((u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[0], data_mask), v8) |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[1], data_mask >> 8),
+ v8)
+ << 8 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[2], data_mask >> 16),
+ v8)
+ << 16 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[3], data_mask >> 24),
+ v8)
+ << 24 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[4], data_mask >> 32),
+ v8)
+ << 32 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[5], data_mask >> 40),
+ v8)
+ << 40 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[6], data_mask >> 48),
+ v8)
+ << 48 |
+ (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[7], data_mask >> 56),
+ v8)
+ << 56);
+
+#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) && \
+ defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u64x4 v4 = u64x4_splat (v);
+ u64x4u *av = (u64x4u *) a;
+
+ for (int i = 0; i < 16; i += 2)
+ {
+ u64 l = u8x32_msb_mask (v4 == u64x4_mask_load_zero (&av[i], data_mask));
+ u64 h = u8x32_msb_mask (
+ v4 == u64x4_mask_load_zero (&av[i + 1], data_mask >> 4));
+ mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4);
+ data_mask >>= 8;
+ }
+#else
+ for (int i = 0; i < n_elts; i++)
+ if (a[i] == v)
+ mask |= 1ULL << i;
+#endif
+ return mask;
+}
+
+/** \brief Compare 64-bit elements with provided value and return bitmap
+
+ @param v value to compare elements with
+ @param a array of u64 elements
+    @param mask array of u64 where resulting mask will be stored
+ @param n_elts number of elements in the array
+ @return none
+*/
+
+static_always_inline void
+clib_mask_compare_u64 (u64 v, u64 *a, u64 *bitmap, u32 n_elts)
+{
+ while (n_elts >= 64)
+ {
+ bitmap++[0] = clib_mask_compare_u64_x64 (v, a);
n_elts -= 64;
a += 64;
}
@@ -160,7 +355,7 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts)
if (PREDICT_TRUE (n_elts == 0))
return;
- bitmap[0] = clib_mask_compare_u32_x64 (v, a, n_elts) & pow2_mask (n_elts);
+ bitmap[0] = clib_mask_compare_u64_x64_n (v, a, n_elts) & pow2_mask (n_elts);
}
#endif
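
A minimal usage sketch for the compare helpers (editorial illustration, not part of
this patch). The array contents are invented; bit i of the output bitmap is set when
element i equals the supplied value, as in the scalar fallback above.

#include <vppinfra/vector/mask_compare.h>

static u64
example_mask_compare (void)
{
  u16 a[10] = { 1, 2, 3, 2, 2, 5, 2, 7, 8, 2 };
  u64 bitmap[1];

  /* sets bits 1, 3, 4, 6 and 9, i.e. bitmap[0] == 0x25a */
  clib_mask_compare_u16 (2, a, bitmap, 10);
  return bitmap[0];
}
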
diff --git a/src/vppinfra/vector/test/array_mask.c b/src/vppinfra/vector/test/array_mask.c
deleted file mode 100644
index a1f4da728d4..00000000000
--- a/src/vppinfra/vector/test/array_mask.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright(c) 2021 Cisco Systems, Inc.
- */
-
-#include <vppinfra/format.h>
-#include <vppinfra/vector/test/test.h>
-#include <vppinfra/vector/array_mask.h>
-
-__clib_test_fn void
-clib_array_mask_u32_wrapper (u32 *src, u32 mask, u32 n_elts)
-{
- clib_array_mask_u32 (src, mask, n_elts);
-}
-
-typedef struct
-{
- u32 mask;
- u32 expected[256];
-} array_mask_test_t;
-
-static array_mask_test_t tests[] = {
- /* mask values 0x1, output array of alternating 0 1 0 1 .. */
- { .mask = 1,
- .expected = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 } },
- /* mask values 0xFFFFFFFF, output array of 0, 1, 2, .., 255 */
- { .mask = ~0U,
- .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
- 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
- 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
- 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
- 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
- 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
- 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
- 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
- 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
- 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
- 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
- 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
- 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
- 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
- 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,
- 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
- 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
- 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
- 252, 253, 254, 255 } },
- /* mask values 0xF, output array of 0, .., 15, 0, .., 15, 0, .., 15 */
- { .mask = 15,
- .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } },
- /* mask values 0x1, output array of 1, 0, 1, 0,.. */
- { .mask = 1, .expected = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 } },
-};
-
-static clib_error_t *
-test_clib_array_mask_u32 (clib_error_t *err)
-{
- u32 i, j;
- for (i = 0; i < ARRAY_LEN (tests) - 1; i++)
- {
- u32 src[256];
- for (j = 0; j < ARRAY_LEN (src); j++)
- src[j] = j;
-
- array_mask_test_t *t = tests + i;
- clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src));
- for (j = 0; j < ARRAY_LEN (src); j++)
- {
- if (src[j] != t->expected[j])
- return clib_error_return (err,
- "testcase %u failed at "
- "(src[%u] = 0x%x, expected 0x%x)",
- i, j, src[j], t->expected[j]);
- }
- }
-
- u32 src[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
- array_mask_test_t *t = tests + i;
-
- clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src));
- for (j = 0; j < ARRAY_LEN (src); j++)
- {
- if (src[j] != t->expected[j])
- return clib_error_return (err,
- "testcase %u failed at "
- "(src[%u] = 0x%x, expected 0x%x)",
- i, j, src[j], t->expected[j]);
- }
-
- return err;
-}
-
-REGISTER_TEST (clib_array_mask_u32) = {
- .name = "clib_array_mask_u32",
- .fn = test_clib_array_mask_u32,
-};
diff --git a/src/vppinfra/vector/test/compress.c b/src/vppinfra/vector/test/compress.c
deleted file mode 100644
index 9bc53ff1e41..00000000000
--- a/src/vppinfra/vector/test/compress.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright(c) 2021 Cisco Systems, Inc.
- */
-
-#include <vppinfra/format.h>
-#include <vppinfra/vector/test/test.h>
-#include <vppinfra/vector/compress.h>
-
-__clib_test_fn u32
-clib_compress_u64_wrapper (u64 *dst, u64 *src, u64 *mask, u32 n_elts)
-{
- return clib_compress_u64 (dst, src, mask, n_elts);
-}
-
-__clib_test_fn u32
-clib_compress_u32_wrapper (u32 *dst, u32 *src, u64 *mask, u32 n_elts)
-{
- return clib_compress_u32 (dst, src, mask, n_elts);
-}
-
-__clib_test_fn u32
-clib_compress_u16_wrapper (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
-{
- return clib_compress_u16 (dst, src, mask, n_elts);
-}
-
-__clib_test_fn u32
-clib_compress_u8_wrapper (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
-{
- return clib_compress_u8 (dst, src, mask, n_elts);
-}
-
-typedef struct
-{
- u64 mask[10];
- u32 n_elts;
-} compress_test_t;
-
-static compress_test_t tests[] = {
- { .mask = { 1 }, .n_elts = 1 },
- { .mask = { 2 }, .n_elts = 2 },
- { .mask = { 3 }, .n_elts = 2 },
- { .mask = { 0, 1 }, .n_elts = 66 },
- { .mask = { 0, 2 }, .n_elts = 69 },
- { .mask = { 0, 3 }, .n_elts = 66 },
- { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 62 },
- { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 255 },
- { .mask = { ~0ULL, 1, 1, ~0ULL }, .n_elts = 256 },
-};
-
-static clib_error_t *
-test_clib_compress_u64 (clib_error_t *err)
-{
- u64 src[513];
- u64 dst[513];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (src); i++)
- src[i] = i;
-
- for (i = 0; i < ARRAY_LEN (tests); i++)
- {
- compress_test_t *t = tests + i;
- u64 *dp = dst;
- u32 r;
-
- for (j = 0; j < ARRAY_LEN (dst); j++)
- dst[j] = 0xa5a5a5a5a5a5a5a5;
-
- r = clib_compress_u64_wrapper (dst, src, t->mask, t->n_elts);
-
- for (j = 0; j < t->n_elts; j++)
- {
- if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0)
- continue;
- if (dp[0] != src[j])
- return clib_error_return (err,
- "wrong data in testcase %u at "
- "(dst[%u] = 0x%lx, src[%u] = 0x%lx)",
- i, dp - dst, dp[0], j, src[j]);
- dp++;
- }
-
- if (dst[dp - dst + 1] != 0xa5a5a5a5a5a5a5a5)
- return clib_error_return (err, "buffer overrun in testcase %u", i);
-
- if (dp - dst != r)
- return clib_error_return (err, "wrong number of elts in testcase %u",
- i);
- }
-
- return err;
-
- return err;
-}
-
-static clib_error_t *
-test_clib_compress_u32 (clib_error_t *err)
-{
- u32 src[513];
- u32 dst[513];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (src); i++)
- src[i] = i;
-
- for (i = 0; i < ARRAY_LEN (tests); i++)
- {
- compress_test_t *t = tests + i;
- u32 *dp = dst;
- u32 r;
-
- for (j = 0; j < ARRAY_LEN (dst); j++)
- dst[j] = 0xa5a5a5a5;
-
- r = clib_compress_u32_wrapper (dst, src, t->mask, t->n_elts);
-
- for (j = 0; j < t->n_elts; j++)
- {
- if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0)
- continue;
-
- if (dp[0] != src[j])
- return clib_error_return (err,
- "wrong data in testcase %u at "
- "(dst[%u] = 0x%x, src[%u] = 0x%x)",
- i, dp - dst, dp[0], j, src[j]);
- dp++;
- }
-
- if (dst[dp - dst + 1] != 0xa5a5a5a5)
- return clib_error_return (err, "buffer overrun in testcase %u", i);
-
- if (dp - dst != r)
- return clib_error_return (err, "wrong number of elts in testcase %u",
- i);
- }
-
- return err;
-}
-
-static clib_error_t *
-test_clib_compress_u16 (clib_error_t *err)
-{
- u16 src[513];
- u16 dst[513];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (src); i++)
- src[i] = i;
-
- for (i = 0; i < ARRAY_LEN (tests); i++)
- {
- compress_test_t *t = tests + i;
- u16 *dp = dst;
- u32 r;
-
- for (j = 0; j < ARRAY_LEN (dst); j++)
- dst[j] = 0xa5a5;
-
- r = clib_compress_u16_wrapper (dst, src, t->mask, t->n_elts);
-
- for (j = 0; j < t->n_elts; j++)
- {
- if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0)
- continue;
- if (dp[0] != src[j])
- return clib_error_return (err,
- "wrong data in testcase %u at "
- "(dst[%u] = 0x%x, src[%u] = 0x%x)",
- i, dp - dst, dp[0], j, src[j]);
- dp++;
- }
-
- if (dst[dp - dst + 1] != 0xa5a5)
- return clib_error_return (err, "buffer overrun in testcase %u", i);
-
- if (dp - dst != r)
- return clib_error_return (err, "wrong number of elts in testcase %u",
- i);
- }
-
- return err;
-}
-
-static clib_error_t *
-test_clib_compress_u8 (clib_error_t *err)
-{
- u8 src[513];
- u8 dst[513];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (src); i++)
- src[i] = i;
-
- for (i = 0; i < ARRAY_LEN (tests); i++)
- {
- compress_test_t *t = tests + i;
- u8 *dp = dst;
- u32 r;
-
- for (j = 0; j < ARRAY_LEN (dst); j++)
- dst[j] = 0xa5;
-
- r = clib_compress_u8_wrapper (dst, src, t->mask, t->n_elts);
-
- for (j = 0; j < t->n_elts; j++)
- {
- if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0)
- continue;
- if (dp[0] != src[j])
- return clib_error_return (err,
- "wrong data in testcase %u at "
- "(dst[%u] = 0x%x, src[%u] = 0x%x)",
- i, dp - dst, dp[0], j, src[j]);
- dp++;
- }
-
- if (dst[dp - dst + 1] != 0xa5)
- return clib_error_return (err, "buffer overrun in testcase %u", i);
-
- if (dp - dst != r)
- return clib_error_return (err, "wrong number of elts in testcase %u",
- i);
- }
-
- return err;
-}
-
-REGISTER_TEST (clib_compress_u64) = {
- .name = "clib_compress_u64",
- .fn = test_clib_compress_u64,
-};
-
-REGISTER_TEST (clib_compress_u32) = {
- .name = "clib_compress_u32",
- .fn = test_clib_compress_u32,
-};
-
-REGISTER_TEST (clib_compress_u16) = {
- .name = "clib_compress_u16",
- .fn = test_clib_compress_u16,
-};
-
-REGISTER_TEST (clib_compress_u8) = {
- .name = "clib_compress_u8",
- .fn = test_clib_compress_u8,
-};
diff --git a/src/vppinfra/vector/test/mask_compare.c b/src/vppinfra/vector/test/mask_compare.c
deleted file mode 100644
index 64df0ee084a..00000000000
--- a/src/vppinfra/vector/test/mask_compare.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright(c) 2021 Cisco Systems, Inc.
- */
-
-#include <vppinfra/format.h>
-#include <vppinfra/vector/test/test.h>
-#include <vppinfra/vector/mask_compare.h>
-
-__clib_test_fn void
-clib_mask_compare_u16_wrapper (u16 v, u16 *a, u64 *mask, u32 n_elts)
-{
- clib_mask_compare_u16 (v, a, mask, n_elts);
-}
-
-__clib_test_fn void
-clib_mask_compare_u32_wrapper (u32 v, u32 *a, u64 *mask, u32 n_elts)
-{
- clib_mask_compare_u32 (v, a, mask, n_elts);
-}
-
-static clib_error_t *
-test_clib_mask_compare_u16 (clib_error_t *err)
-{
- u16 array[513];
- u64 mask[10];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (array); i++)
- array[i] = i;
-
- for (i = 0; i < ARRAY_LEN (array); i++)
- {
- for (j = 0; j < ARRAY_LEN (mask); j++)
- mask[j] = 0xa5a5a5a5a5a5a5a5;
-
- clib_mask_compare_u16_wrapper (i, array, mask, i + 1);
-
- for (j = 0; j < (i >> 6); j++)
- {
- if (mask[j])
- return clib_error_return (err, "mask at position %u not zero", j);
- }
- if (mask[j] != 1ULL << (i & 0x3f))
- return clib_error_return (err,
- "mask at position %u is %lx, expected %lx",
- j, mask[j], 1ULL << (i % 64));
-
- if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5)
- return clib_error_return (err, "mask overrun at position %u", j + 1);
- }
- return err;
-}
-
-REGISTER_TEST (clib_mask_compare_u16) = {
- .name = "clib_mask_compare_u16",
- .fn = test_clib_mask_compare_u16,
-};
-
-static clib_error_t *
-test_clib_mask_compare_u32 (clib_error_t *err)
-{
- u32 array[513];
- u64 mask[10];
- u32 i, j;
-
- for (i = 0; i < ARRAY_LEN (array); i++)
- array[i] = i;
-
- for (i = 0; i < ARRAY_LEN (array); i++)
- {
- for (j = 0; j < ARRAY_LEN (mask); j++)
- mask[j] = 0xa5a5a5a5a5a5a5a5;
-
- clib_mask_compare_u32_wrapper (i, array, mask, i + 1);
-
- for (j = 0; j < (i >> 6); j++)
- {
- if (mask[j])
- return clib_error_return (err, "mask at position %u not zero", j);
- }
- if (mask[j] != 1ULL << (i & 0x3f))
- return clib_error_return (err,
- "mask at position %u is %lx, expected %lx",
- j, mask[j], 1ULL << (i % 64));
-
- if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5)
- return clib_error_return (err, "mask overrun at position %u", j + 1);
- }
- return err;
-}
-
-REGISTER_TEST (clib_mask_compare_u32) = {
- .name = "clib_mask_compare_u32",
- .fn = test_clib_mask_compare_u32,
-};
diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c
deleted file mode 100644
index 1a8b9d6ea10..00000000000
--- a/src/vppinfra/vector/test/test.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright(c) 2021 Cisco Systems, Inc.
- */
-
-#include <vppinfra/format.h>
-#include <vppinfra/vector/test/test.h>
-
-test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS] = {};
-
-int
-test_march_supported (clib_march_variant_type_t type)
-{
-#define _(s, n) \
- if (CLIB_MARCH_VARIANT_TYPE_##s == type) \
- return clib_cpu_march_priority_##s ();
- foreach_march_variant
-#undef _
- return 0;
-}
-
-int
-main (int argc, char *argv[])
-{
- clib_mem_init (0, 64ULL << 20);
-
- for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++)
- {
- test_registration_t *r = test_registrations[i];
-
- if (r == 0 || test_march_supported (i) < 0)
- continue;
-
- fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i);
- fformat (stdout,
- "-------------------------------------------------------\n");
- while (r)
- {
- clib_error_t *err;
- err = (r->fn) (0);
- fformat (stdout, "%-50s %s\n", r->name, err ? "FAIL" : "PASS");
- if (err)
- {
- clib_error_report (err);
- fformat (stdout, "\n");
- }
-
- r = r->next;
- }
- }
-
- fformat (stdout, "\n");
- return 0;
-}
diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h
deleted file mode 100644
index bc499fb24e8..00000000000
--- a/src/vppinfra/vector/test/test.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright(c) 2021 Cisco Systems, Inc.
- */
-
-#ifndef included_test_test_h
-#define included_test_test_h
-
-#include <vppinfra/cpu.h>
-
-typedef clib_error_t *(test_fn_t) (clib_error_t *);
-
-typedef struct test_registration_
-{
- char *name;
- u8 multiarch : 1;
- test_fn_t *fn;
- struct test_registration_ *next;
-} test_registration_t;
-
-extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS];
-
-#define __clib_test_fn static __clib_noinline __clib_section (".test_wrapper")
-
-#define REGISTER_TEST(x) \
- test_registration_t CLIB_MARCH_SFX (__test_##x); \
- static void __clib_constructor CLIB_MARCH_SFX (__test_registration_##x) ( \
- void) \
- { \
- test_registration_t *r = &CLIB_MARCH_SFX (__test_##x); \
- r->next = test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)]; \
- test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r; \
- } \
- test_registration_t CLIB_MARCH_SFX (__test_##x)
-
-#endif
diff --git a/src/vppinfra/vector/toeplitz.c b/src/vppinfra/vector/toeplitz.c
new file mode 100644
index 00000000000..fcc4b64ad19
--- /dev/null
+++ b/src/vppinfra/vector/toeplitz.c
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/clib.h>
+#include <vppinfra/mem.h>
+#include <vppinfra/vector/toeplitz.h>
+
+static u8 default_key[40] = {
+ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67,
+ 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb,
+ 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30,
+ 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+#ifdef __x86_64__
+static_always_inline void
+clib_toeplitz_hash_key_expand_8 (u64x2 kv, u64x8u *m)
+{
+ u64x8 kv4, a, b, shift = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+ kv4 = (u64x8){ kv[0], kv[1], kv[0], kv[1], kv[0], kv[1], kv[0], kv[1] };
+
+ /* clang-format off */
+ /* create 8 byte-swapped copies of the bytes 0 - 7 */
+ a = (u64x8) u8x64_shuffle (kv4,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0,
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0);
+ /* create 8 byte-swapped copies of the bytes 4 - 11 */
+ b = (u64x8) u8x64_shuffle (kv4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
+ 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4);
+ /* clang-format on */
+
+  /* shift each 64-bit element by 0 - 7 bits */
+ a <<= shift;
+ b <<= shift;
+
+ /* clang-format off */
+  /* construct eight 8x8 bit matrices used by gf2p8affine */
+  *m = (u64x8) u8x64_shuffle2 (a, b,
+ 0x07, 0x0f, 0x17, 0x1f, 0x27, 0x2f, 0x37, 0x3f,
+ 0x06, 0x0e, 0x16, 0x1e, 0x26, 0x2e, 0x36, 0x3e,
+ 0x05, 0x0d, 0x15, 0x1d, 0x25, 0x2d, 0x35, 0x3d,
+ 0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3c,
+ 0x47, 0x4f, 0x57, 0x5f, 0x67, 0x6f, 0x77, 0x7f,
+ 0x46, 0x4e, 0x56, 0x5e, 0x66, 0x6e, 0x76, 0x7e,
+ 0x45, 0x4d, 0x55, 0x5d, 0x65, 0x6d, 0x75, 0x7d,
+ 0x44, 0x4c, 0x54, 0x5c, 0x64, 0x6c, 0x74, 0x7c);
+ /* clang-format on */
+}
+
+void
+clib_toeplitz_hash_key_expand (u64 *matrixes, u8 *key, int size)
+{
+ u64x8u *m = (u64x8u *) matrixes;
+ u64x2 kv = {}, zero = {};
+
+ while (size >= 8)
+ {
+ kv = *(u64x2u *) key;
+ clib_toeplitz_hash_key_expand_8 (kv, m);
+ key += 8;
+ m++;
+ size -= 8;
+ }
+
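+  /* expand the final window: the tail of the key shifted into the low lane,
+   * zero padded at the end */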
+ kv = u64x2_shuffle2 (kv, zero, 1, 2);
+ clib_toeplitz_hash_key_expand_8 (kv, m);
+}
+#endif
+
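+/* allocate the key state as one cache-line aligned block: header and raw
+ * key, followed by the GFNI matrices on x86_64 */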
+__clib_export clib_toeplitz_hash_key_t *
+clib_toeplitz_hash_key_init (u8 *key, u32 keylen)
+{
+ clib_toeplitz_hash_key_t *k;
+ u32 size, gfni_size = 0;
+
+ if (key == 0)
+ {
+ key = default_key;
+ keylen = sizeof (default_key);
+ }
+
+ size =
+ round_pow2 (sizeof (clib_toeplitz_hash_key_t) + round_pow2 (keylen, 16),
+ CLIB_CACHE_LINE_BYTES);
+#ifdef __x86_64__
+ gfni_size = round_pow2 ((keylen + 1) * 8, CLIB_CACHE_LINE_BYTES);
+#endif
+
+ k = clib_mem_alloc_aligned (size + gfni_size, CLIB_CACHE_LINE_BYTES);
+ clib_memset_u8 (k, 0, size + gfni_size);
+ k->key_length = keylen;
+ k->gfni_offset = size;
+ clib_memcpy_fast (k->data, key, keylen);
+
+#ifdef __x86_64__
+ clib_toeplitz_hash_key_expand ((u64 *) ((u8 *) k + k->gfni_offset), k->data,
+ k->key_length);
+#endif
+
+ return k;
+}
+
+__clib_export void
+clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k)
+{
+ clib_mem_free (k);
+}
diff --git a/src/vppinfra/vector/toeplitz.h b/src/vppinfra/vector/toeplitz.h
new file mode 100644
index 00000000000..76297f05195
--- /dev/null
+++ b/src/vppinfra/vector/toeplitz.h
@@ -0,0 +1,513 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_toeplitz_h
+#define included_vector_toeplitz_h
+#include <vppinfra/clib.h>
+
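+/* key state: raw key bytes live in data[]; on x86_64 the GFNI-expanded
+ * matrices start gfni_offset bytes from the beginning of the struct */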
+typedef struct
+{
+ u16 key_length;
+ u16 gfni_offset;
+ u8 data[];
+} clib_toeplitz_hash_key_t;
+
+clib_toeplitz_hash_key_t *clib_toeplitz_hash_key_init (u8 *key, u32 keylen);
+void clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k);
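+
+/* Typical usage (illustrative sketch; tuple_bytes / n_tuple_bytes are
+ * placeholders, not symbols defined by this change):
+ *
+ *   clib_toeplitz_hash_key_t *k = clib_toeplitz_hash_key_init (0, 0);
+ *   u32 rss = clib_toeplitz_hash (k, tuple_bytes, n_tuple_bytes);
+ *   clib_toeplitz_hash_key_free (k);
+ *
+ * Passing a null key selects the built-in default key. */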
+
+#ifdef CLIB_HAVE_VEC256
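+/* fold one data byte into 8 hash lanes: each lane holds the upper 32 bits of
+ * the key window shifted by (off * 8) + 0..7 bits and is XORed in only when
+ * the matching data bit is set */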
+static_always_inline u32x8
+toeplitz_hash_one_x8 (u32x8 hash, u64x4 v4, u8 data, u8 off)
+{
+ u32x8 v8 = u32x8_shuffle2 (v4 << (off * 8), v4 << (off * 8 + 4),
+			     /* upper 32 bits of each u64 in reverse order */
+ 15, 13, 11, 9, 7, 5, 3, 1);
+
+#ifdef CLIB_HAVE_VEC256_MASK_BITWISE_OPS
+ return u32x8_mask_xor (hash, v8, data);
+#else
+ static const u32x8 bits = { 1, 2, 4, 8, 16, 32, 64, 128 };
+ return hash ^ (((u32x8_splat (data) & bits) != u32x8_zero ()) & v8);
+#endif
+}
+#endif
+
+#if defined(__GFNI__) && defined(__AVX512F__)
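+/* permute pattern: for each starting offset 0..7, gather the 4-byte data
+ * window from each of the two 64-byte inputs */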
+static const u8x64 __clib_toeplitz_hash_gfni_permute = {
+ /* clang-format off */
+ 0x00, 0x01, 0x02, 0x03, 0x40, 0x41, 0x42, 0x43,
+ 0x01, 0x02, 0x03, 0x04, 0x41, 0x42, 0x43, 0x44,
+ 0x02, 0x03, 0x04, 0x05, 0x42, 0x43, 0x44, 0x45,
+ 0x03, 0x04, 0x05, 0x06, 0x43, 0x44, 0x45, 0x46,
+ 0x04, 0x05, 0x06, 0x07, 0x44, 0x45, 0x46, 0x47,
+ 0x05, 0x06, 0x07, 0x08, 0x45, 0x46, 0x47, 0x48,
+ 0x06, 0x07, 0x08, 0x09, 0x46, 0x47, 0x48, 0x49,
+ 0x07, 0x08, 0x09, 0x0a, 0x47, 0x48, 0x49, 0x4a
+ /* clang-format on */
+};
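+/* hash data bytes 8*i .. 8*i+7: rotate the input so they sit at the front,
+ * gather each byte's 4-byte window, then apply the key bit-matrices with
+ * GF2P8AFFINEQB */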
+static_always_inline u64x8
+clib_toeplitz_hash_gfni_one (u8x64 d0, u64x8 m, int i)
+{
+
+ d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0;
+ d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0;
+ d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0;
+ d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0;
+ d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0;
+ d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0;
+
+ d0 = u8x64_permute (__clib_toeplitz_hash_gfni_permute, d0);
+
+ return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0);
+}
+
+static_always_inline u64x8
+clib_toeplitz_hash_gfni_two (u8x64 d0, u8x64 d1, u64x8 m, int i)
+{
+
+ d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0;
+ d1 = i == 1 ? (u8x64) u64x8_align_right (d1, d1, 1) : d1;
+ d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0;
+ d1 = i == 2 ? (u8x64) u64x8_align_right (d1, d1, 2) : d1;
+ d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0;
+ d1 = i == 3 ? (u8x64) u64x8_align_right (d1, d1, 3) : d1;
+ d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0;
+ d1 = i == 4 ? (u8x64) u64x8_align_right (d1, d1, 4) : d1;
+ d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0;
+ d1 = i == 5 ? (u8x64) u64x8_align_right (d1, d1, 5) : d1;
+ d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0;
+ d1 = i == 6 ? (u8x64) u64x8_align_right (d1, d1, 6) : d1;
+
+ d0 = u8x64_permute2 (__clib_toeplitz_hash_gfni_permute, d0, d1);
+
+ return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0);
+}
+#endif
+
+static_always_inline u32
+clib_toeplitz_hash (clib_toeplitz_hash_key_t *k, u8 *data, int n_bytes)
+{
+ u8 *key = k->data;
+  /* key must be at least 4 bytes longer than data */
+ ASSERT (k->key_length - n_bytes >= 4);
+
+#if defined(__GFNI__) && defined(__AVX512F__)
+ u8x64 d0;
+ u64x8 h0 = {};
+ u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset);
+
+  /* move data ptr backwards by 3 bytes so the mask load "prepends" three zeros */
+ data -= 3;
+ n_bytes += 3;
+
+ if (n_bytes < 64)
+ {
+ d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes - 3) << 3);
+ goto last8;
+ }
+
+ d0 = u8x64_mask_load_zero ((u8 *) data, -1ULL << 3);
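+  /* main loop: each pass folds 56 data bytes using the next 7 groups of
+   * expanded key matrices */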
+next56:
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[0], 0),
+ clib_toeplitz_hash_gfni_one (d0, m[1], 1));
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[2], 2),
+ clib_toeplitz_hash_gfni_one (d0, m[3], 3));
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[4], 4),
+ clib_toeplitz_hash_gfni_one (d0, m[5], 5));
+ h0 ^= clib_toeplitz_hash_gfni_one (d0, m[6], 6);
+ n_bytes -= 56;
+ data += 56;
+ m += 7;
+
+ if (n_bytes >= 64)
+ {
+ d0 = *(u8x64u *) data;
+ goto next56;
+ }
+
+ if (n_bytes == 0)
+ goto done;
+
+ d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes));
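+  /* fold the remaining data 8 bytes at a time, shifting the vector right by
+   * one qword between iterations */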
+last8:
+ h0 ^= clib_toeplitz_hash_gfni_one (d0, m[0], 0);
+ n_bytes -= 8;
+
+ if (n_bytes > 0)
+ {
+ m += 1;
+ d0 = (u8x64) u64x8_align_right (u64x8_zero (), d0, 1);
+ goto last8;
+ }
+
+done:
+ return u64x8_hxor (h0);
+#elif defined(CLIB_HAVE_VEC256)
+ u64x4 v4, shift = { 0, 1, 2, 3 };
+ u32x8 h0 = {};
+
+ while (n_bytes >= 4)
+ {
+ v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift;
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[3], 3);
+
+ data += 4;
+ key += 4;
+ n_bytes -= 4;
+ }
+
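+  /* 1 - 3 trailing bytes: data byte i still needs key bytes i .. i + 4, so
+   * assemble the remaining key window by hand */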
+ if (n_bytes)
+ {
+ u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
+ v |= (u64) key[4] << 24;
+
+ if (n_bytes == 3)
+ {
+ v |= (u64) key[5] << 16;
+ v |= (u64) key[6] << 8;
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2);
+ }
+ else if (n_bytes == 2)
+ {
+ v |= (u64) key[5] << 16;
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1);
+ }
+ else
+ {
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0);
+ }
+ }
+
+ return u32x8_hxor (h0);
+#endif
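+  /* scalar fallback: bit-serial Toeplitz, shifting the key window left by
+   * one bit per data bit */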
+ u64 v, hash = 0;
+
+ while (n_bytes >= 4)
+ {
+ v = clib_net_to_host_u64 (*(u64u *) key);
+
+ for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ for (u8 bit = 1 << 7, byte = data[3]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+
+ data += 4;
+ key += 4;
+ n_bytes -= 4;
+ }
+
+ if (n_bytes)
+ {
+ v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
+ v |= (u64) key[4] << 24;
+ for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ if (n_bytes > 1)
+ {
+ v |= (u64) key[5] << 24;
+ for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ }
+ if (n_bytes > 2)
+ {
+ v |= (u64) key[6] << 24;
+ for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1)
+ hash ^= byte & bit ? v : 0;
+ }
+ }
+ return hash >> 32;
+}
+
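+/* hash four equally sized buffers at once, sharing key loads and expanded
+ * matrices across all four */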
+static_always_inline void
+clib_toeplitz_hash_x4 (clib_toeplitz_hash_key_t *k, u8 *data0, u8 *data1,
+ u8 *data2, u8 *data3, u32 *hash0, u32 *hash1,
+ u32 *hash2, u32 *hash3, int n_bytes)
+{
+  /* key must be at least 4 bytes longer than data */
+ ASSERT (k->key_length - n_bytes >= 4);
+#if defined(__GFNI__) && defined(__AVX512F__)
+ u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset);
+ u8x64 d0, d1, d2, d3;
+ u64x8 h0 = {}, h2 = {};
+ u64 h, mask;
+
+  /* move data ptr backwards by 3 bytes so the mask load "prepends" three zeros */
+ data0 -= 3;
+ data1 -= 3;
+ data2 -= 3;
+ data3 -= 3;
+ n_bytes += 3;
+
+ if (n_bytes < 64)
+ {
+ mask = pow2_mask (n_bytes - 3) << 3;
+ d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
+ d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
+ d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
+ d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
+ goto last8;
+ }
+
+ mask = -1ULL << 3;
+ d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
+ d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
+ d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
+ d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
+next56:
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0),
+ clib_toeplitz_hash_gfni_two (d0, d1, m[1], 1));
+ h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0),
+ clib_toeplitz_hash_gfni_two (d2, d3, m[1], 1));
+
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[2], 2),
+ clib_toeplitz_hash_gfni_two (d0, d1, m[3], 3));
+ h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[2], 2),
+ clib_toeplitz_hash_gfni_two (d2, d3, m[3], 3));
+
+ h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[4], 4),
+ clib_toeplitz_hash_gfni_two (d0, d1, m[5], 5));
+ h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[4], 4),
+ clib_toeplitz_hash_gfni_two (d2, d3, m[5], 5));
+
+ h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[6], 6);
+ h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[6], 6);
+
+ n_bytes -= 56;
+ data0 += 56;
+ data1 += 56;
+ data2 += 56;
+ data3 += 56;
+ m += 7;
+
+ if (n_bytes >= 64)
+ {
+ d0 = *(u8x64u *) data0;
+ d1 = *(u8x64u *) data1;
+ d2 = *(u8x64u *) data2;
+ d3 = *(u8x64u *) data3;
+ goto next56;
+ }
+
+ if (n_bytes == 0)
+ goto done;
+
+ mask = pow2_mask (n_bytes);
+ d0 = u8x64_mask_load_zero ((u8 *) data0, mask);
+ d1 = u8x64_mask_load_zero ((u8 *) data1, mask);
+ d2 = u8x64_mask_load_zero ((u8 *) data2, mask);
+ d3 = u8x64_mask_load_zero ((u8 *) data3, mask);
+last8:
+ h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0);
+ h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0);
+ n_bytes -= 8;
+
+ if (n_bytes > 0)
+ {
+ u64x8 zero = {};
+ m += 1;
+ d0 = (u8x64) u64x8_align_right (zero, d0, 1);
+ d1 = (u8x64) u64x8_align_right (zero, d1, 1);
+ d2 = (u8x64) u64x8_align_right (zero, d2, 1);
+ d3 = (u8x64) u64x8_align_right (zero, d3, 1);
+ goto last8;
+ }
+
+done:
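+  /* each folded 64-bit value packs two 32-bit hashes: low half for
+   * data0/data2, high half for data1/data3 */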
+ h = u64x8_hxor (h0);
+ *hash0 = h;
+ *hash1 = h >> 32;
+ h = u64x8_hxor (h2);
+ *hash2 = h;
+ *hash3 = h >> 32;
+#elif defined(CLIB_HAVE_VEC256)
+ u8 *key = k->data;
+ u64x4 v4, shift = { 0, 1, 2, 3 };
+ u32x8 h0 = {}, h1 = {}, h2 = {}, h3 = {};
+
+ while (n_bytes >= 4)
+ {
+ v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift;
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[3], 3);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[3], 3);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[3], 3);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[3], 3);
+
+ data0 += 4;
+ data1 += 4;
+ data2 += 4;
+ data3 += 4;
+ key += 4;
+ n_bytes -= 4;
+ }
+
+ if (n_bytes)
+ {
+ u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
+ v |= (u64) key[4] << 24;
+
+ if (n_bytes == 3)
+ {
+ v |= (u64) key[5] << 16;
+ v |= (u64) key[6] << 8;
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2);
+ }
+ else if (n_bytes == 2)
+ {
+ v |= (u64) key[5] << 16;
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
+
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1);
+ }
+ else
+ {
+ v4 = u64x4_splat (v) << shift;
+ h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0);
+ h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0);
+ h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0);
+ h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0);
+ }
+ }
+
+ *hash0 = u32x8_hxor (h0);
+ *hash1 = u32x8_hxor (h1);
+ *hash2 = u32x8_hxor (h2);
+ *hash3 = u32x8_hxor (h3);
+#else
+ u8 *key = k->data;
+ u64 v, h0 = 0, h1 = 0, h2 = 0, h3 = 0;
+
+ while (n_bytes >= 4)
+ {
+ v = clib_net_to_host_u64 (*(u64u *) key);
+
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[0] & bit ? v : 0;
+ h1 ^= data1[0] & bit ? v : 0;
+ h2 ^= data2[0] & bit ? v : 0;
+ h3 ^= data3[0] & bit ? v : 0;
+ }
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[1] & bit ? v : 0;
+ h1 ^= data1[1] & bit ? v : 0;
+ h2 ^= data2[1] & bit ? v : 0;
+ h3 ^= data3[1] & bit ? v : 0;
+ }
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[2] & bit ? v : 0;
+ h1 ^= data1[2] & bit ? v : 0;
+ h2 ^= data2[2] & bit ? v : 0;
+ h3 ^= data3[2] & bit ? v : 0;
+ }
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[3] & bit ? v : 0;
+ h1 ^= data1[3] & bit ? v : 0;
+ h2 ^= data2[3] & bit ? v : 0;
+ h3 ^= data3[3] & bit ? v : 0;
+ }
+
+ data0 += 4;
+ data1 += 4;
+ data2 += 4;
+ data3 += 4;
+ key += 4;
+ n_bytes -= 4;
+ }
+
+ if (n_bytes)
+ {
+ v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32;
+ v |= (u64) key[4] << 24;
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[0] & bit ? v : 0;
+ h1 ^= data1[0] & bit ? v : 0;
+ h2 ^= data2[0] & bit ? v : 0;
+ h3 ^= data3[0] & bit ? v : 0;
+ }
+ if (n_bytes > 1)
+ {
+ v |= (u64) key[5] << 24;
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[1] & bit ? v : 0;
+ h1 ^= data1[1] & bit ? v : 0;
+ h2 ^= data2[1] & bit ? v : 0;
+ h3 ^= data3[1] & bit ? v : 0;
+ }
+ }
+ if (n_bytes > 2)
+ {
+ v |= (u64) key[6] << 24;
+ for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1)
+ {
+ h0 ^= data0[2] & bit ? v : 0;
+ h1 ^= data1[2] & bit ? v : 0;
+ h2 ^= data2[2] & bit ? v : 0;
+ h3 ^= data3[2] & bit ? v : 0;
+ }
+ }
+ }
+ *hash0 = h0 >> 32;
+ *hash1 = h1 >> 32;
+ *hash2 = h2 >> 32;
+ *hash3 = h3 >> 32;
+#endif
+}
+
+#endif