Diffstat (limited to 'src/vppinfra/vector')
-rw-r--r-- | src/vppinfra/vector/array_mask.h | 119
-rw-r--r-- | src/vppinfra/vector/compress.h | 140
-rw-r--r-- | src/vppinfra/vector/count_equal.h | 306
-rw-r--r-- | src/vppinfra/vector/index_to_ptr.h | 257
-rw-r--r-- | src/vppinfra/vector/ip_csum.h | 339
-rw-r--r-- | src/vppinfra/vector/mask_compare.h | 207
-rw-r--r-- | src/vppinfra/vector/test/array_mask.c | 124
-rw-r--r-- | src/vppinfra/vector/test/compress.c | 248
-rw-r--r-- | src/vppinfra/vector/test/mask_compare.c | 95
-rw-r--r-- | src/vppinfra/vector/test/test.c | 53
-rw-r--r-- | src/vppinfra/vector/test/test.h | 35
-rw-r--r-- | src/vppinfra/vector/toeplitz.c | 122
-rw-r--r-- | src/vppinfra/vector/toeplitz.h | 513
13 files changed, 1942 insertions, 616 deletions
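Before the diffs, a brief illustrative sketch of how the vector helpers touched by this change fit together (editor's example, not part of the patch; the function name example_filter, the element count bound of 256 and the match value 42 are made up, and only APIs that appear in the diffs below are used):

#include <vppinfra/vector/array_mask.h>
#include <vppinfra/vector/compress.h>
#include <vppinfra/vector/count_equal.h>
#include <vppinfra/vector/mask_compare.h>

/* Build a bitmap of elements equal to 42, compress the matching elements
   into a dense array, then mask the remaining values down to 8 bits. */
static u32
example_filter (u32 *elts, u32 n_elts, u32 *matches)
{
  u64 bitmap[4] = {}; /* holds up to 256 element bits */
  u32 n_match;
  uword n_lead;

  clib_mask_compare_u32 (42, elts, bitmap, n_elts);            /* mask_compare.h */
  n_match = clib_compress_u32 (matches, elts, bitmap, n_elts); /* compress.h */
  n_lead = clib_count_equal_u32 (elts, n_elts);                /* count_equal.h */
  (void) n_lead;
  clib_array_mask_u32 (elts, 0xff, n_elts);                    /* array_mask.h */

  return n_match;
}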
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h index 778ed3e638f..3d4a82ac01b 100644 --- a/src/vppinfra/vector/array_mask.h +++ b/src/vppinfra/vector/array_mask.h @@ -17,61 +17,114 @@ static_always_inline void clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts) { - u32 i; #if defined(CLIB_HAVE_VEC512) u32x16 mask16 = u32x16_splat (mask); - - for (i = 0; i + 16 <= n_elts; i += 16) - *((u32x16u *) (src + i)) &= mask16; - n_elts -= i; - if (n_elts) + if (n_elts <= 16) { - u16 m = pow2_mask (n_elts); - u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i, - m); + u32 m = pow2_mask (n_elts); + u32x16 r = u32x16_mask_load_zero (src, m); + u32x16_mask_store (r & mask16, src, m); + return; } - return; + for (; n_elts >= 16; n_elts -= 16, src += 16) + *((u32x16u *) src) &= mask16; + *((u32x16u *) (src + n_elts - 16)) &= mask16; #elif defined(CLIB_HAVE_VEC256) u32x8 mask8 = u32x8_splat (mask); - - for (i = 0; i + 8 <= n_elts; i += 8) - *((u32x8u *) (src + i)) &= mask8; - n_elts -= i; - src += i; #if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) - if (n_elts) + if (n_elts <= 8) { - u8 m = pow2_mask (n_elts); - u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m); + u32 m = pow2_mask (n_elts); + u32x8 r = u32x8_mask_load_zero (src, m); + u32x8_mask_store (r & mask8, src, m); + return; + } +#else + if (PREDICT_FALSE (n_elts < 4)) + { + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; + } + if (n_elts <= 8) + { + u32x4 mask4 = u32x4_splat (mask); + *(u32x4u *) src &= mask4; + *(u32x4u *) (src + n_elts - 4) &= mask4; + return; } - return; #endif + + for (; n_elts >= 8; n_elts -= 8, src += 8) + *((u32x8u *) src) &= mask8; + *((u32x8u *) (src + n_elts - 8)) &= mask8; #elif defined(CLIB_HAVE_VEC128) u32x4 mask4 = u32x4_splat (mask); - for (i = 0; i + 4 <= n_elts; i += 4) - *((u32x4u *) (src + i)) &= mask4; - n_elts -= i; - src += i; - switch (n_elts) + if (PREDICT_FALSE (n_elts < 4)) { - case 3: - src[2] &= mask; - case 2: - src[1] &= mask; - case 1: - src[0] &= mask; - case 0: - default:; + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; } + + for (; n_elts >= 4; n_elts -= 4, src += 4) + *((u32x4u *) src) &= mask4; + *((u32x4u *) (src + n_elts - 4)) &= mask4; return; -#endif +#else while (n_elts > 0) { src[0] &= mask; src++; n_elts--; } +#endif +} + +static_always_inline void +clib_array_mask_set_u32_x64 (u32 *a, u32 v, uword bmp, int n_elts) +{ +#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u32x16 r = u32x16_splat (v); + for (; n_elts > 0; n_elts -= 16, a += 16, bmp >>= 16) + u32x16_mask_store (r, a, bmp); +#elif defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8 r = u32x8_splat (v); + for (; n_elts > 0; n_elts -= 8, a += 8, bmp >>= 8) + u32x8_mask_store (r, a, bmp); +#else + while (bmp) + { + a[get_lowest_set_bit_index (bmp)] = v; + bmp = clear_lowest_set_bit (bmp); + } +#endif +} + +static_always_inline void +clib_array_mask_set_u32 (u32 *a, u32 v, uword *bmp, u32 n_elts) +{ + while (n_elts >= uword_bits) + { + clib_array_mask_set_u32_x64 (a, v, bmp++[0], uword_bits); + a += uword_bits; + n_elts -= uword_bits; + } + + clib_array_mask_set_u32_x64 (a, v, bmp[0] & pow2_mask (n_elts), n_elts); } #endif diff --git a/src/vppinfra/vector/compress.h b/src/vppinfra/vector/compress.h index adb6503f711..5429113984b 100644 --- a/src/vppinfra/vector/compress.h +++ b/src/vppinfra/vector/compress.h @@ -27,12 +27,40 @@ 
clib_compress_u64_x64 (u64 *dst, u64 *src, u64 mask) mask >>= 4; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u64 * +clib_compress_u64_x64_masked (u64 *dst, u64 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u64x8u *sv = (u64x8u *) src; + for (int i = 0; i < 8; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u64x8u s = u64x8_mask_load_zero (&sv[i], mask); + u64x8_compress_store (s, mask, dst); + dst += _popcnt32 ((u8) mask); + mask >>= 8; } +#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u64x4u *sv = (u64x4u *) src; + for (int i = 0; i < 16; i++) + { + u64x4u s = u64x4_mask_load_zero (&sv[i], mask); + u64x4_compress_store (s, mask, dst); + dst += _popcnt32 (((u8) mask) & 0x0f); + mask >>= 4; + } +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -69,7 +97,9 @@ clib_compress_u64 (u64 *dst, u64 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u64_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u64_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u32 * @@ -93,12 +123,41 @@ clib_compress_u32_x64 (u32 *dst, u32 *src, u64 mask) mask >>= 8; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u32 * +clib_compress_u32_x64_masked (u32 *dst, u32 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u32x16u *sv = (u32x16u *) src; + for (int i = 0; i < 4; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u32x16u s = u32x16_mask_load_zero (&sv[i], mask); + u32x16_compress_store (s, mask, dst); + dst += _popcnt32 ((u16) mask); + mask >>= 16; } + +#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8u *sv = (u32x8u *) src; + for (int i = 0; i < 8; i++) + { + u32x8u s = u32x8_mask_load_zero (&sv[i], mask); + u32x8_compress_store (s, mask, dst); + dst += _popcnt32 ((u8) mask); + mask >>= 8; + } +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -135,7 +194,9 @@ clib_compress_u32 (u32 *dst, u32 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u32_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u32_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u16 * @@ -150,12 +211,30 @@ clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask) mask >>= 32; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u16 * +clib_compress_u16_x64_masked (u16 *dst, u16 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u16x32u *sv = (u16x32u *) src; + for (int i = 0; i < 2; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u16x32u s = u16x32_mask_load_zero (&sv[i], mask); + u16x32_compress_store (s, mask, dst); + dst += _popcnt32 ((u32) mask); + mask >>= 32; } +#else + u32 i; + foreach_set_bit_index (i, mask) + 
dst++[0] = src[i]; #endif return dst; } @@ -192,7 +271,9 @@ clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u16_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u16_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u8 * @@ -203,12 +284,26 @@ clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask) u8x64_compress_store (sv[0], mask, dst); dst += _popcnt64 (mask); #else - while (mask) - { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; - } + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u8 * +clib_compress_u8_x64_masked (u8 *dst, u8 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u8x64u *sv = (u8x64u *) src; + u8x64u s = u8x64_mask_load_zero (sv, mask); + u8x64_compress_store (s, mask, dst); + dst += _popcnt64 (mask); +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -245,7 +340,8 @@ clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u8_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u8_x64_masked (dst, src, mask[0] & pow2_mask (n_elts)) - + dst0; } #endif diff --git a/src/vppinfra/vector/count_equal.h b/src/vppinfra/vector/count_equal.h new file mode 100644 index 00000000000..ca2fbb7fd39 --- /dev/null +++ b/src/vppinfra/vector/count_equal.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_count_equal_h +#define included_vector_count_equal_h +#include <vppinfra/clib.h> + +static_always_inline uword +clib_count_equal_u64 (u64 *data, uword max_count) +{ + uword count; + u64 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u64x4 splat = u64x4_splat (first); + while (count + 3 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 8; + return count; + } + + data += 4; + count += 4; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u32 (u32 *data, uword max_count) +{ + uword count; + u32 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC512) + u32x16 splat = u32x16_splat (first); + while (count + 15 < max_count) + { + u32 bmp; + bmp = u32x16_is_equal_mask (u32x16_load_unaligned (data), splat); + if (bmp != pow2_mask (16)) + return count + count_trailing_zeros (~bmp); + + data += 16; + count += 16; + } + if (count == max_count) + return count; + else + { + u32 mask = pow2_mask (max_count - count); + u32 bmp = + u32x16_is_equal_mask (u32x16_mask_load_zero (data, mask), splat) & + mask; + return count + count_trailing_zeros (~bmp); + } +#elif defined(CLIB_HAVE_VEC256) + u32x8 splat = u32x8_splat (first); + while (count + 
7 < max_count) + { + u32 bmp; +#ifdef __AVX512F__ + bmp = u32x8_is_equal_mask (u32x8_load_unaligned (data), splat); + if (bmp != pow2_mask (8)) + return count + count_trailing_zeros (~bmp); +#else + bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + return count + count_trailing_zeros (~bmp) / 4; +#endif + + data += 8; + count += 8; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + else + { + u32 mask = pow2_mask (max_count - count); + u32 bmp = + u32x8_is_equal_mask (u32x8_mask_load_zero (data, mask), splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u32x4 splat = u32x4_splat (first); + while (count + 3 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat)); + if (bmp != pow2_mask (4 * 4)) + { + count += count_trailing_zeros (~bmp) / 4; + return count; + } + + data += 4; + count += 4; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u16 (u16 *data, uword max_count) +{ + uword count; + u16 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u16x16 splat = u16x16_splat (first); + while (count + 15 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 2; + return count; + } + + data += 16; + count += 16; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u16x8 splat = u16x8_splat (first); + while (count + 7 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 2; + return count; + } + + data += 8; + count += 8; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u8 (u8 *data, uword max_count) +{ + uword count; + u8 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC512) + u8x64 splat = u8x64_splat (first); + while (count + 63 < max_count) + { + u64 bmp; + bmp = u8x64_is_equal_mask (u8x64_load_unaligned (data), splat); + if (bmp != -1) + return count + count_trailing_zeros (~bmp); + + data += 64; + count += 64; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + else + { + u64 mask = pow2_mask (max_count - count); + u64 bmp = + u8x64_is_equal_mask (u8x64_mask_load_zero (data, mask), splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC256) + u8x32 splat = u8x32_splat (first); + while (count + 31 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + 
return count + count_trailing_zeros (~bmp); + + data += 32; + count += 32; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + else + { + u32 mask = pow2_mask (max_count - count); + u64 bmp = + u8x32_msb_mask (u8x32_mask_load_zero (data, mask) == splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u8x16 splat = u8x16_splat (first); + while (count + 15 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat)); + if (bmp != 0xffff) + return count + count_trailing_zeros (~bmp); + + data += 16; + count += 16; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +#endif diff --git a/src/vppinfra/vector/index_to_ptr.h b/src/vppinfra/vector/index_to_ptr.h new file mode 100644 index 00000000000..3985b757d54 --- /dev/null +++ b/src/vppinfra/vector/index_to_ptr.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_index_to_ptr_h +#define included_vector_index_to_ptr_h +#include <vppinfra/clib.h> + +#ifdef CLIB_HAVE_VEC128 +static_always_inline void +clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift) +{ + u32x4 iv4 = u32x4_load_unaligned (indices + i); + u64x2 pv2; + pv2 = u64x2_from_u32x4 (iv4); + u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i); +#ifdef __aarch64__ + pv2 = u64x2_from_u32x4_high (iv4); +#else + pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8)); +#endif + u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2); +} +#endif + +/** \brief Convert array of indices to pointers with base and shift + + @param indices source array of u32 indices + @param base base pointer + @param shift numbers of bits to be shifted + @param ptrs destinatin array of pointers + @param n_elts number of elements in the source array +*/ + +static_always_inline void +clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs, + u32 n_elts) +{ +#if defined CLIB_HAVE_VEC512 + if (n_elts >= 8) + { + u64x8 off = u64x8_splat ((u64) base); + u64x8 b0, b1, b2, b3, b4, b5, b6, b7; + + while (n_elts >= 64) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16)); + b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24)); + b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32)); + b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40)); + b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48)); + b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16); + u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24); + u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32); + u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40); + u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48); + u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56); + ptrs += 64; + indices += 64; + n_elts -= 64; + } + + if (n_elts == 0) + return; + + if (n_elts >= 32) + { + 
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16)); + b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16); + u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24); + ptrs += 32; + indices += 32; + n_elts -= 32; + } + if (n_elts >= 16) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + ptrs += 16; + indices += 16; + n_elts -= 16; + } + if (n_elts >= 8) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + ptrs += 8; + indices += 8; + n_elts -= 8; + } + + if (n_elts == 0) + return; + + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8); + } + else + { + u32 mask = pow2_mask (n_elts); + u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask)); + u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask); + return; + } +#elif defined CLIB_HAVE_VEC256 + if (n_elts >= 4) + { + u64x4 off = u64x4_splat ((u64) base); + u64x4 b0, b1, b2, b3, b4, b5, b6, b7; + + while (n_elts >= 32) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8)); + b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12)); + b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16)); + b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20)); + b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24)); + b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8); + u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12); + u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16); + u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20); + u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24); + u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28); + ptrs += 32; + indices += 32; + n_elts -= 32; + } + + if (n_elts == 0) + return; + + if (n_elts >= 16) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8)); + b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8); + u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12); + ptrs += 16; + indices += 16; + n_elts -= 16; + } + if (n_elts >= 8) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + ptrs += 8; + indices += 8; + n_elts -= 8; + } + if (n_elts > 4) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + u64x4_store_unaligned ((b0 << shift) + 
off, ptrs); + ptrs += 4; + indices += 4; + n_elts -= 4; + } + + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4); + return; + } +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + else + { + u32 mask = pow2_mask (n_elts); + u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask)); + u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask); + return; + } +#endif +#elif defined(CLIB_HAVE_VEC128) + if (n_elts >= 4) + { + u64x2 ov = u64x2_splat ((u64) base); + u32 *i = (u32 *) indices; + void **p = (void **) ptrs; + u32 n = n_elts; + + while (n >= 32) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift); + indices += 32; + ptrs += 32; + n -= 32; + } + + if (n == 0) + return; + + if (n >= 16) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift); + indices += 16; + ptrs += 16; + n -= 16; + } + + if (n >= 8) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + indices += 8; + ptrs += 8; + n -= 8; + } + + if (n > 4) + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + + clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift); + return; + } +#endif + while (n_elts) + { + ptrs[0] = base + ((u64) indices[0] << shift); + ptrs += 1; + indices += 1; + n_elts -= 1; + } +} + +#endif diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h new file mode 100644 index 00000000000..2cea9b448ea --- /dev/null +++ b/src/vppinfra/vector/ip_csum.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. 
+ */ + +#ifndef included_vector_ip_csum_h +#define included_vector_ip_csum_h +#include <vppinfra/clib.h> +typedef struct +{ + u64 sum; + u8 odd; +} clib_ip_csum_t; + +#if defined(CLIB_HAVE_VEC128) +static_always_inline u64x2 +clib_ip_csum_cvt_and_add_4 (u32x4 v) +{ + return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) + + (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_2 (u64x2 v) +{ + return v[0] + v[1]; +} +#endif + +#if defined(CLIB_HAVE_VEC256) +static_always_inline u64x4 +clib_ip_csum_cvt_and_add_8 (u32x8 v) +{ + return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) + + (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_4 (u64x4 v) +{ + return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v)); +} +#endif + +#if defined(CLIB_HAVE_VEC512) +static_always_inline u64x8 +clib_ip_csum_cvt_and_add_16 (u32x16 v) +{ + return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) + + (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_8 (u64x8 v) +{ + return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v)); +} +#endif + +static_always_inline void +clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count, + int is_copy) +{ + if (c->odd) + { + c->odd = 0; + c->sum += (u16) src[0] << 8; + count--; + src++; + if (is_copy) + dst++[0] = src[0]; + } + +#if defined(CLIB_HAVE_VEC512) + u64x8 sum8 = {}; + + while (count >= 512) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[1]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[2]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[3]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[8]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[5]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[6]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[7]); + count -= 512; + src += 512; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 512; + } + } + + while (count >= 64) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + count -= 64; + src += 64; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + dst += 512; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u64 mask = pow2_mask (count); + u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask); + sum8 += clib_ip_csum_cvt_and_add_16 (v); + c->odd = count & 1; + if (is_copy) + u32x16_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_8 (sum8); + return; +#endif + + c->sum += clib_ip_csum_hadd_8 (sum8); +#elif defined(CLIB_HAVE_VEC256) + u64x4 sum4 = {}; + + while (count >= 256) + { + u32x8u *s = (u32x8u *) src; + sum4 += clib_ip_csum_cvt_and_add_8 (s[0]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[1]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[2]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[3]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[4]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[5]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[6]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[7]); + count -= 256; + src += 256; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 256; + } + } + + while (count >= 32) + { + u32x8u *s = (u32x8u *) src; + sum4 += 
clib_ip_csum_cvt_and_add_8 (s[0]); + count -= 32; + src += 32; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + dst += 32; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u32 mask = pow2_mask (count); + u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask); + sum4 += clib_ip_csum_cvt_and_add_8 (v); + c->odd = count & 1; + if (is_copy) + u32x8_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_4 (sum4); + return; +#endif + + c->sum += clib_ip_csum_hadd_4 (sum4); +#elif defined(CLIB_HAVE_VEC128) + u64x2 sum2 = {}; + + while (count >= 128) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[1]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[2]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[3]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[4]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[5]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[6]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[7]); + count -= 128; + src += 128; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 128; + } + } + + while (count >= 16) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + count -= 16; + src += 16; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + dst += 16; + } + } + c->sum += clib_ip_csum_hadd_2 (sum2); +#else + while (count >= 4) + { + u32 v = *((u32 *) src); + c->sum += v; + count -= 4; + src += 4; + if (is_copy) + { + *(u32 *) dst = v; + dst += 4; + } + } +#endif + while (count >= 2) + { + u16 v = *((u16 *) src); + c->sum += v; + count -= 2; + src += 2; + if (is_copy) + { + *(u16 *) dst = v; + dst += 2; + } + } + + if (count) + { + c->odd = 1; + c->sum += (u16) src[0]; + if (is_copy) + dst[0] = src[0]; + } +} + +static_always_inline u16 +clib_ip_csum_fold (clib_ip_csum_t *c) +{ + u64 sum = c->sum; +#if defined(__x86_64__) && defined(__BMI2__) + u64 tmp = sum; + asm volatile( + /* using ADC is much faster than mov, shift, add sequence + * compiler produces */ + "shr $32, %[sum] \n\t" + "add %k[tmp], %k[sum] \n\t" + "mov $16, %k[tmp] \n\t" + "shrx %k[tmp], %k[sum], %k[tmp] \n\t" + "adc %w[tmp], %w[sum] \n\t" + "adc $0, %w[sum] \n\t" + : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp)); +#else + sum = ((u32) sum) + (sum >> 32); + sum = ((u16) sum) + (sum >> 16); + sum = ((u16) sum) + (sum >> 16); +#endif + return (~((u16) sum)); +} + +static_always_inline void +clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count) +{ + return clib_ip_csum_inline (c, 0, src, count, 0); +} + +static_always_inline void +clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count) +{ + return clib_ip_csum_inline (c, dst, src, count, 1); +} + +static_always_inline u16 +clib_ip_csum (u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + if (COMPILE_TIME_CONST (count) && count == 12) + { + for (int i = 0; i < 3; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 20) + { + for (int i = 0; i < 5; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 40) + { + for (int i = 0; i < 10; i++) + c.sum += ((u32 *) src)[i]; + } + else + clib_ip_csum_inline (&c, 0, src, count, 0); + return clib_ip_csum_fold (&c); +} + +static_always_inline u16 +clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + clib_ip_csum_inline (&c, dst, src, count, 1); + return clib_ip_csum_fold (&c); +} + 
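/* Editor's note, not part of this header: a minimal usage sketch of the
 * incremental checksum API above; "hdr", "payload", "data" and the lengths
 * are hypothetical buffers/sizes.
 *
 *   clib_ip_csum_t c = {};
 *   clib_ip_csum_chunk (&c, hdr, 20);      // e.g. a 20-byte IPv4 header
 *   clib_ip_csum_chunk (&c, payload, len); // any number of further chunks
 *   u16 folded = clib_ip_csum_fold (&c);   // ones-complement fold of the sum
 *
 * or, for a single contiguous buffer:
 *
 *   u16 folded = clib_ip_csum (data, count);
 */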
+#endif diff --git a/src/vppinfra/vector/mask_compare.h b/src/vppinfra/vector/mask_compare.h index cac48a31f47..fc72d7dac35 100644 --- a/src/vppinfra/vector/mask_compare.h +++ b/src/vppinfra/vector/mask_compare.h @@ -8,7 +8,7 @@ #include <vppinfra/memcpy.h> static_always_inline u64 -clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts) +clib_mask_compare_u16_x64 (u16 v, u16 *a) { u64 mask = 0; #if defined(CLIB_HAVE_VEC512) @@ -47,6 +47,38 @@ clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts) (u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 | (u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48); #else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u16_x64_n (u16 v, u16 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u16x32 v32 = u16x32_splat (v); + u16x32u *av = (u16x32u *) a; + mask = ((u64) u16x32_is_equal_mask ( + u16x32_mask_load_zero (&av[0], data_mask), v32) | + (u64) u16x32_is_equal_mask ( + u16x32_mask_load_zero (&av[1], data_mask >> 32), v32) + << 32); +#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u16x16 v16 = u16x16_splat (v); + u16x16u *av = (u16x16u *) a; + i8x32 x; + + x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[0], data_mask), + v16 == u16x16_mask_load_zero (&av[1], data_mask >> 16)); + mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)); + x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[2], data_mask >> 32), + v16 == u16x16_mask_load_zero (&av[3], data_mask >> 48)); + mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32; +#else for (int i = 0; i < n_elts; i++) if (a[i] == v) mask |= 1ULL << i; @@ -68,7 +100,7 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts) { while (n_elts >= 64) { - mask++[0] = clib_mask_compare_u16_x64 (v, a, 64); + mask++[0] = clib_mask_compare_u16_x64 (v, a); n_elts -= 64; a += 64; } @@ -76,11 +108,11 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return; - mask[0] = clib_mask_compare_u16_x64 (v, a, n_elts) & pow2_mask (n_elts); + mask[0] = clib_mask_compare_u16_x64_n (v, a, n_elts) & pow2_mask (n_elts); } static_always_inline u64 -clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts) +clib_mask_compare_u32_x64 (u32 v, u32 *a) { u64 mask = 0; #if defined(CLIB_HAVE_VEC512) @@ -131,6 +163,57 @@ clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts) } #else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u32_x64_n (u32 v, u32 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u32x16 v16 = u32x16_splat (v); + u32x16u *av = (u32x16u *) a; + mask = ((u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[0], data_mask), v16) | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[1], data_mask >> 16), v16) + << 16 | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[2], data_mask >> 32), v16) + << 32 | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[3], data_mask >> 48), v16) + << 48); +#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8 v8 = u32x8_splat (v); + u32x8u *av = (u32x8u *) a; + u32x8 m = { 0, 4, 1, 5, 2, 6, 3, 7 }; + i8x32 c; + + c = i8x32_pack ( + i16x16_pack ( + (i32x8) (v8 == 
u32x8_mask_load_zero (&av[0], data_mask)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[1], data_mask >> 8))), + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[2], data_mask >> 16)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[3], data_mask >> 24)))); + mask = i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)); + + c = i8x32_pack ( + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[4], data_mask >> 32)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[5], data_mask >> 40))), + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[6], data_mask >> 48)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[7], data_mask >> 56)))); + mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32; + mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32; +#else for (int i = 0; i < n_elts; i++) if (a[i] == v) mask |= 1ULL << i; @@ -152,7 +235,119 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts) { while (n_elts >= 64) { - bitmap++[0] = clib_mask_compare_u32_x64 (v, a, 64); + bitmap++[0] = clib_mask_compare_u32_x64 (v, a); + n_elts -= 64; + a += 64; + } + + if (PREDICT_TRUE (n_elts == 0)) + return; + + bitmap[0] = clib_mask_compare_u32_x64_n (v, a, n_elts) & pow2_mask (n_elts); +} + +static_always_inline u64 +clib_mask_compare_u64_x64 (u64 v, u64 *a) +{ + u64 mask = 0; +#if defined(CLIB_HAVE_VEC512) + u64x8 v8 = u64x8_splat (v); + u64x8u *av = (u64x8u *) a; + mask = ((u64) u64x8_is_equal_mask (av[0], v8) | + (u64) u64x8_is_equal_mask (av[1], v8) << 8 | + (u64) u64x8_is_equal_mask (av[2], v8) << 16 | + (u64) u64x8_is_equal_mask (av[3], v8) << 24 | + (u64) u64x8_is_equal_mask (av[4], v8) << 32 | + (u64) u64x8_is_equal_mask (av[5], v8) << 40 | + (u64) u64x8_is_equal_mask (av[6], v8) << 48 | + (u64) u64x8_is_equal_mask (av[7], v8) << 56); + +#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) + u64x4 v4 = u64x4_splat (v); + u64x4u *av = (u64x4u *) a; + + for (int i = 0; i < 16; i += 2) + { + u64 l = u8x32_msb_mask (v4 == av[i]); + u64 h = u8x32_msb_mask (v4 == av[i + 1]); + mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4); + } +#else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u64_x64_n (u64 v, u64 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u64x8 v8 = u64x8_splat (v); + u64x8u *av = (u64x8u *) a; + mask = + ((u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[0], data_mask), v8) | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[1], data_mask >> 8), + v8) + << 8 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[2], data_mask >> 16), + v8) + << 16 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[3], data_mask >> 24), + v8) + << 24 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[4], data_mask >> 32), + v8) + << 32 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[5], data_mask >> 40), + v8) + << 40 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[6], data_mask >> 48), + v8) + << 48 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[7], data_mask >> 56), + v8) + << 56); + +#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u64x4 v4 = u64x4_splat (v); + u64x4u *av = (u64x4u *) a; + + for (int i = 0; i < 16; i += 2) + { + u64 l = u8x32_msb_mask (v4 == u64x4_mask_load_zero (&av[i], data_mask)); + u64 h = u8x32_msb_mask ( + v4 == u64x4_mask_load_zero (&av[i + 1], 
data_mask >> 4)); + mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4); + data_mask >>= 8; + } +#else + for (int i = 0; i < n_elts; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +/** \brief Compare 64-bit elemments with provied value and return bitmap + + @param v value to compare elements with + @param a array of u64 elements + @param mask array of u64 where reuslting mask will be stored + @param n_elts number of elements in the array + @return none +*/ + +static_always_inline void +clib_mask_compare_u64 (u64 v, u64 *a, u64 *bitmap, u32 n_elts) +{ + while (n_elts >= 64) + { + bitmap++[0] = clib_mask_compare_u64_x64 (v, a); n_elts -= 64; a += 64; } @@ -160,7 +355,7 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return; - bitmap[0] = clib_mask_compare_u32_x64 (v, a, n_elts) & pow2_mask (n_elts); + bitmap[0] = clib_mask_compare_u64_x64_n (v, a, n_elts) & pow2_mask (n_elts); } #endif diff --git a/src/vppinfra/vector/test/array_mask.c b/src/vppinfra/vector/test/array_mask.c deleted file mode 100644 index a1f4da728d4..00000000000 --- a/src/vppinfra/vector/test/array_mask.c +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/array_mask.h> - -__clib_test_fn void -clib_array_mask_u32_wrapper (u32 *src, u32 mask, u32 n_elts) -{ - clib_array_mask_u32 (src, mask, n_elts); -} - -typedef struct -{ - u32 mask; - u32 expected[256]; -} array_mask_test_t; - -static array_mask_test_t tests[] = { - /* mask values 0x1, output array of alternating 0 1 0 1 .. */ - { .mask = 1, - .expected = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 } }, - /* mask values 0xFFFFFFFF, output array of 0, 1, 2, .., 255 */ - { .mask = ~0U, - .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, - 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, - 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, - 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, - 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, - 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 
197, 198, 199, 200, 201, 202, 203, - 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, - 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, - 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, - 252, 253, 254, 255 } }, - /* mask values 0xF, output array of 0, .., 15, 0, .., 15, 0, .., 15 */ - { .mask = 15, - .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } }, - /* mask values 0x1, output array of 1, 0, 1, 0,.. */ - { .mask = 1, .expected = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 } }, -}; - -static clib_error_t * -test_clib_array_mask_u32 (clib_error_t *err) -{ - u32 i, j; - for (i = 0; i < ARRAY_LEN (tests) - 1; i++) - { - u32 src[256]; - for (j = 0; j < ARRAY_LEN (src); j++) - src[j] = j; - - array_mask_test_t *t = tests + i; - clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src)); - for (j = 0; j < ARRAY_LEN (src); j++) - { - if (src[j] != t->expected[j]) - return clib_error_return (err, - "testcase %u failed at " - "(src[%u] = 0x%x, expected 0x%x)", - i, j, src[j], t->expected[j]); - } - } - - u32 src[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - array_mask_test_t *t = tests + i; - - clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src)); - for (j = 0; j < ARRAY_LEN (src); j++) - { - if (src[j] != t->expected[j]) - return clib_error_return (err, - "testcase %u failed at " - "(src[%u] = 0x%x, expected 0x%x)", - i, j, src[j], t->expected[j]); - } - - return err; -} - -REGISTER_TEST (clib_array_mask_u32) = { - .name = "clib_array_mask_u32", - .fn = test_clib_array_mask_u32, -}; diff --git a/src/vppinfra/vector/test/compress.c b/src/vppinfra/vector/test/compress.c deleted file mode 100644 index 9bc53ff1e41..00000000000 --- a/src/vppinfra/vector/test/compress.c +++ /dev/null @@ -1,248 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. 
- */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/compress.h> - -__clib_test_fn u32 -clib_compress_u64_wrapper (u64 *dst, u64 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u64 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u32_wrapper (u32 *dst, u32 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u32 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u16_wrapper (u16 *dst, u16 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u16 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u8_wrapper (u8 *dst, u8 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u8 (dst, src, mask, n_elts); -} - -typedef struct -{ - u64 mask[10]; - u32 n_elts; -} compress_test_t; - -static compress_test_t tests[] = { - { .mask = { 1 }, .n_elts = 1 }, - { .mask = { 2 }, .n_elts = 2 }, - { .mask = { 3 }, .n_elts = 2 }, - { .mask = { 0, 1 }, .n_elts = 66 }, - { .mask = { 0, 2 }, .n_elts = 69 }, - { .mask = { 0, 3 }, .n_elts = 66 }, - { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 62 }, - { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 255 }, - { .mask = { ~0ULL, 1, 1, ~0ULL }, .n_elts = 256 }, -}; - -static clib_error_t * -test_clib_compress_u64 (clib_error_t *err) -{ - u64 src[513]; - u64 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u64 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5a5a5a5a5a5a5; - - r = clib_compress_u64_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%lx, src[%u] = 0x%lx)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; - - return err; -} - -static clib_error_t * -test_clib_compress_u32 (clib_error_t *err) -{ - u32 src[513]; - u32 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u32 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5a5a5; - - r = clib_compress_u32_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5a5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -static clib_error_t * -test_clib_compress_u16 (clib_error_t *err) -{ - u16 src[513]; - u16 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u16 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5; - - r = clib_compress_u16_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - 
{ - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -static clib_error_t * -test_clib_compress_u8 (clib_error_t *err) -{ - u8 src[513]; - u8 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u8 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5; - - r = clib_compress_u8_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -REGISTER_TEST (clib_compress_u64) = { - .name = "clib_compress_u64", - .fn = test_clib_compress_u64, -}; - -REGISTER_TEST (clib_compress_u32) = { - .name = "clib_compress_u32", - .fn = test_clib_compress_u32, -}; - -REGISTER_TEST (clib_compress_u16) = { - .name = "clib_compress_u16", - .fn = test_clib_compress_u16, -}; - -REGISTER_TEST (clib_compress_u8) = { - .name = "clib_compress_u8", - .fn = test_clib_compress_u8, -}; diff --git a/src/vppinfra/vector/test/mask_compare.c b/src/vppinfra/vector/test/mask_compare.c deleted file mode 100644 index 64df0ee084a..00000000000 --- a/src/vppinfra/vector/test/mask_compare.c +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. 
- */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/mask_compare.h> - -__clib_test_fn void -clib_mask_compare_u16_wrapper (u16 v, u16 *a, u64 *mask, u32 n_elts) -{ - clib_mask_compare_u16 (v, a, mask, n_elts); -} - -__clib_test_fn void -clib_mask_compare_u32_wrapper (u32 v, u32 *a, u64 *mask, u32 n_elts) -{ - clib_mask_compare_u32 (v, a, mask, n_elts); -} - -static clib_error_t * -test_clib_mask_compare_u16 (clib_error_t *err) -{ - u16 array[513]; - u64 mask[10]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (array); i++) - array[i] = i; - - for (i = 0; i < ARRAY_LEN (array); i++) - { - for (j = 0; j < ARRAY_LEN (mask); j++) - mask[j] = 0xa5a5a5a5a5a5a5a5; - - clib_mask_compare_u16_wrapper (i, array, mask, i + 1); - - for (j = 0; j < (i >> 6); j++) - { - if (mask[j]) - return clib_error_return (err, "mask at position %u not zero", j); - } - if (mask[j] != 1ULL << (i & 0x3f)) - return clib_error_return (err, - "mask at position %u is %lx, expected %lx", - j, mask[j], 1ULL << (i % 64)); - - if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "mask overrun at position %u", j + 1); - } - return err; -} - -REGISTER_TEST (clib_mask_compare_u16) = { - .name = "clib_mask_compare_u16", - .fn = test_clib_mask_compare_u16, -}; - -static clib_error_t * -test_clib_mask_compare_u32 (clib_error_t *err) -{ - u32 array[513]; - u64 mask[10]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (array); i++) - array[i] = i; - - for (i = 0; i < ARRAY_LEN (array); i++) - { - for (j = 0; j < ARRAY_LEN (mask); j++) - mask[j] = 0xa5a5a5a5a5a5a5a5; - - clib_mask_compare_u32_wrapper (i, array, mask, i + 1); - - for (j = 0; j < (i >> 6); j++) - { - if (mask[j]) - return clib_error_return (err, "mask at position %u not zero", j); - } - if (mask[j] != 1ULL << (i & 0x3f)) - return clib_error_return (err, - "mask at position %u is %lx, expected %lx", - j, mask[j], 1ULL << (i % 64)); - - if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "mask overrun at position %u", j + 1); - } - return err; -} - -REGISTER_TEST (clib_mask_compare_u32) = { - .name = "clib_mask_compare_u32", - .fn = test_clib_mask_compare_u32, -}; diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c deleted file mode 100644 index 1a8b9d6ea10..00000000000 --- a/src/vppinfra/vector/test/test.c +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> - -test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS] = {}; - -int -test_march_supported (clib_march_variant_type_t type) -{ -#define _(s, n) \ - if (CLIB_MARCH_VARIANT_TYPE_##s == type) \ - return clib_cpu_march_priority_##s (); - foreach_march_variant -#undef _ - return 0; -} - -int -main (int argc, char *argv[]) -{ - clib_mem_init (0, 64ULL << 20); - - for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++) - { - test_registration_t *r = test_registrations[i]; - - if (r == 0 || test_march_supported (i) < 0) - continue; - - fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i); - fformat (stdout, - "-------------------------------------------------------\n"); - while (r) - { - clib_error_t *err; - err = (r->fn) (0); - fformat (stdout, "%-50s %s\n", r->name, err ? 
"FAIL" : "PASS"); - if (err) - { - clib_error_report (err); - fformat (stdout, "\n"); - } - - r = r->next; - } - } - - fformat (stdout, "\n"); - return 0; -} diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h deleted file mode 100644 index bc499fb24e8..00000000000 --- a/src/vppinfra/vector/test/test.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#ifndef included_test_test_h -#define included_test_test_h - -#include <vppinfra/cpu.h> - -typedef clib_error_t *(test_fn_t) (clib_error_t *); - -typedef struct test_registration_ -{ - char *name; - u8 multiarch : 1; - test_fn_t *fn; - struct test_registration_ *next; -} test_registration_t; - -extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS]; - -#define __clib_test_fn static __clib_noinline __clib_section (".test_wrapper") - -#define REGISTER_TEST(x) \ - test_registration_t CLIB_MARCH_SFX (__test_##x); \ - static void __clib_constructor CLIB_MARCH_SFX (__test_registration_##x) ( \ - void) \ - { \ - test_registration_t *r = &CLIB_MARCH_SFX (__test_##x); \ - r->next = test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)]; \ - test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r; \ - } \ - test_registration_t CLIB_MARCH_SFX (__test_##x) - -#endif diff --git a/src/vppinfra/vector/toeplitz.c b/src/vppinfra/vector/toeplitz.c new file mode 100644 index 00000000000..fcc4b64ad19 --- /dev/null +++ b/src/vppinfra/vector/toeplitz.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#include <vppinfra/clib.h> +#include <vppinfra/mem.h> +#include <vppinfra/vector/toeplitz.h> + +static u8 default_key[40] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, + 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, + 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, + 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, +}; + +#ifdef __x86_64__ +static_always_inline void +clib_toeplitz_hash_key_expand_8 (u64x2 kv, u64x8u *m) +{ + u64x8 kv4, a, b, shift = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + kv4 = (u64x8){ kv[0], kv[1], kv[0], kv[1], kv[0], kv[1], kv[0], kv[1] }; + + /* clang-format off */ + /* create 8 byte-swapped copies of the bytes 0 - 7 */ + a = (u64x8) u8x64_shuffle (kv4, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0); + /* create 8 byte-swapped copies of the bytes 4 - 11 */ + b = (u64x8) u8x64_shuffle (kv4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4); + /* clang-format on */ + + /* shift each 64-bit element for 0 - 7 bits */ + a <<= shift; + b <<= shift; + + /* clang-format off */ + /* construct eight 8x8 bit matrix used by gf2p8affine */ + * m = (u64x8) u8x64_shuffle2 (a, b, + 0x07, 0x0f, 0x17, 0x1f, 0x27, 0x2f, 0x37, 0x3f, + 0x06, 0x0e, 0x16, 0x1e, 0x26, 0x2e, 0x36, 0x3e, + 0x05, 0x0d, 0x15, 0x1d, 0x25, 0x2d, 0x35, 0x3d, + 0x04, 0x0c, 0x14, 0x1c, 0x24, 
0x2c, 0x34, 0x3c, + 0x47, 0x4f, 0x57, 0x5f, 0x67, 0x6f, 0x77, 0x7f, + 0x46, 0x4e, 0x56, 0x5e, 0x66, 0x6e, 0x76, 0x7e, + 0x45, 0x4d, 0x55, 0x5d, 0x65, 0x6d, 0x75, 0x7d, + 0x44, 0x4c, 0x54, 0x5c, 0x64, 0x6c, 0x74, 0x7c); + /* clang-format on */ +} + +void +clib_toeplitz_hash_key_expand (u64 *matrixes, u8 *key, int size) +{ + u64x8u *m = (u64x8u *) matrixes; + u64x2 kv = {}, zero = {}; + + while (size >= 8) + { + kv = *(u64x2u *) key; + clib_toeplitz_hash_key_expand_8 (kv, m); + key += 8; + m++; + size -= 8; + } + + kv = u64x2_shuffle2 (kv, zero, 1, 2); + clib_toeplitz_hash_key_expand_8 (kv, m); +} +#endif + +__clib_export clib_toeplitz_hash_key_t * +clib_toeplitz_hash_key_init (u8 *key, u32 keylen) +{ + clib_toeplitz_hash_key_t *k; + u32 size, gfni_size = 0; + + if (key == 0) + { + key = default_key; + keylen = sizeof (default_key); + } + + size = + round_pow2 (sizeof (clib_toeplitz_hash_key_t) + round_pow2 (keylen, 16), + CLIB_CACHE_LINE_BYTES); +#ifdef __x86_64__ + gfni_size = round_pow2 ((keylen + 1) * 8, CLIB_CACHE_LINE_BYTES); +#endif + + k = clib_mem_alloc_aligned (size + gfni_size, CLIB_CACHE_LINE_BYTES); + clib_memset_u8 (k, 0, size + gfni_size); + k->key_length = keylen; + k->gfni_offset = size; + clib_memcpy_fast (k->data, key, keylen); + +#ifdef __x86_64__ + clib_toeplitz_hash_key_expand ((u64 *) ((u8 *) k + k->gfni_offset), k->data, + k->key_length); +#endif + + return k; +} + +__clib_export void +clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k) +{ + clib_mem_free (k); +} diff --git a/src/vppinfra/vector/toeplitz.h b/src/vppinfra/vector/toeplitz.h new file mode 100644 index 00000000000..76297f05195 --- /dev/null +++ b/src/vppinfra/vector/toeplitz.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_toeplitz_h +#define included_vector_toeplitz_h +#include <vppinfra/clib.h> + +typedef struct +{ + u16 key_length; + u16 gfni_offset; + u8 data[]; +} clib_toeplitz_hash_key_t; + +clib_toeplitz_hash_key_t *clib_toeplitz_hash_key_init (u8 *key, u32 keylen); +void clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k); + +#ifdef CLIB_HAVE_VEC256 +static_always_inline u32x8 +toeplitz_hash_one_x8 (u32x8 hash, u64x4 v4, u8 data, u8 off) +{ + u32x8 v8 = u32x8_shuffle2 (v4 << (off * 8), v4 << (off * 8 + 4), + /*uppper 32 bits of each u64 in reverse order */ + 15, 13, 11, 9, 7, 5, 3, 1); + +#ifdef CLIB_HAVE_VEC256_MASK_BITWISE_OPS + return u32x8_mask_xor (hash, v8, data); +#else + static const u32x8 bits = { 1, 2, 4, 8, 16, 32, 64, 128 }; + return hash ^ (((u32x8_splat (data) & bits) != u32x8_zero ()) & v8); +#endif +} +#endif + +#if defined(__GFNI__) && defined(__AVX512F__) +static const u8x64 __clib_toeplitz_hash_gfni_permute = { + /* clang-format off */ + 0x00, 0x01, 0x02, 0x03, 0x40, 0x41, 0x42, 0x43, + 0x01, 0x02, 0x03, 0x04, 0x41, 0x42, 0x43, 0x44, + 0x02, 0x03, 0x04, 0x05, 0x42, 0x43, 0x44, 0x45, + 0x03, 0x04, 0x05, 0x06, 0x43, 0x44, 0x45, 0x46, + 0x04, 0x05, 0x06, 0x07, 0x44, 0x45, 0x46, 0x47, + 0x05, 0x06, 0x07, 0x08, 0x45, 0x46, 0x47, 0x48, + 0x06, 0x07, 0x08, 0x09, 0x46, 0x47, 0x48, 0x49, + 0x07, 0x08, 0x09, 0x0a, 0x47, 0x48, 0x49, 0x4a + /* clang-format on */ +}; +static_always_inline u64x8 +clib_toeplitz_hash_gfni_one (u8x64 d0, u64x8 m, int i) +{ + + d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0; + d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0; + d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0; + d0 = i == 4 ? 
(u8x64) u64x8_align_right (d0, d0, 4) : d0; + d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0; + d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0; + + d0 = u8x64_permute (__clib_toeplitz_hash_gfni_permute, d0); + + return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0); +} + +static_always_inline u64x8 +clib_toeplitz_hash_gfni_two (u8x64 d0, u8x64 d1, u64x8 m, int i) +{ + + d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0; + d1 = i == 1 ? (u8x64) u64x8_align_right (d1, d1, 1) : d1; + d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0; + d1 = i == 2 ? (u8x64) u64x8_align_right (d1, d1, 2) : d1; + d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0; + d1 = i == 3 ? (u8x64) u64x8_align_right (d1, d1, 3) : d1; + d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0; + d1 = i == 4 ? (u8x64) u64x8_align_right (d1, d1, 4) : d1; + d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0; + d1 = i == 5 ? (u8x64) u64x8_align_right (d1, d1, 5) : d1; + d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0; + d1 = i == 6 ? (u8x64) u64x8_align_right (d1, d1, 6) : d1; + + d0 = u8x64_permute2 (__clib_toeplitz_hash_gfni_permute, d0, d1); + + return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0); +} +#endif + +static_always_inline u32 +clib_toeplitz_hash (clib_toeplitz_hash_key_t *k, u8 *data, int n_bytes) +{ + u8 *key = k->data; + /* key must be 4 bytes longer than data */ + ASSERT (k->key_length - n_bytes >= 4); + +#if defined(__GFNI__) && defined(__AVX512F__) + u8x64 d0; + u64x8 h0 = {}; + u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset); + + /* move data ptr backwards for 3 byte so mask load "prepends" three zeros */ + data -= 3; + n_bytes += 3; + + if (n_bytes < 64) + { + d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes - 3) << 3); + goto last8; + } + + d0 = u8x64_mask_load_zero ((u8 *) data, -1ULL << 3); +next56: + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[0], 0), + clib_toeplitz_hash_gfni_one (d0, m[1], 1)); + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[2], 2), + clib_toeplitz_hash_gfni_one (d0, m[3], 3)); + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[4], 4), + clib_toeplitz_hash_gfni_one (d0, m[5], 5)); + h0 ^= clib_toeplitz_hash_gfni_one (d0, m[6], 6); + n_bytes -= 56; + data += 56; + m += 7; + + if (n_bytes >= 64) + { + d0 = *(u8x64u *) data; + goto next56; + } + + if (n_bytes == 0) + goto done; + + d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes)); +last8: + h0 ^= clib_toeplitz_hash_gfni_one (d0, m[0], 0); + n_bytes -= 8; + + if (n_bytes > 0) + { + m += 1; + d0 = (u8x64) u64x8_align_right (u64x8_zero (), d0, 1); + goto last8; + } + +done: + return u64x8_hxor (h0); +#elif defined(CLIB_HAVE_VEC256) + u64x4 v4, shift = { 0, 1, 2, 3 }; + u32x8 h0 = {}; + + while (n_bytes >= 4) + { + v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift; + + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2); + h0 = toeplitz_hash_one_x8 (h0, v4, data[3], 3); + + data += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + + if (n_bytes == 3) + { + v |= (u64) key[5] << 16; + v |= (u64) key[6] << 8; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + h0 = toeplitz_hash_one_x8 (h0, v4, 
data[2], 2); + } + else if (n_bytes == 2) + { + v |= (u64) key[5] << 16; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + } + else + { + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + } + } + + return u32x8_hxor (h0); +#endif + u64 v, hash = 0; + + while (n_bytes >= 4) + { + v = clib_net_to_host_u64 (*(u64u *) key); + + for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[3]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + + data += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + if (n_bytes > 1) + { + v |= (u64) key[5] << 24; + for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + } + if (n_bytes > 2) + { + v |= (u64) key[6] << 24; + for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + } + } + return hash >> 32; +} + +static_always_inline void +clib_toeplitz_hash_x4 (clib_toeplitz_hash_key_t *k, u8 *data0, u8 *data1, + u8 *data2, u8 *data3, u32 *hash0, u32 *hash1, + u32 *hash2, u32 *hash3, int n_bytes) +{ + /* key must be 4 bytes longer than data */ + ASSERT (k->key_length - n_bytes >= 4); +#if defined(__GFNI__) && defined(__AVX512F__) + u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset); + u8x64 d0, d1, d2, d3; + u64x8 h0 = {}, h2 = {}; + u64 h, mask; + + /* move data ptr backwards for 3 byte so mask load "prepends" three zeros */ + data0 -= 3; + data1 -= 3; + data2 -= 3; + data3 -= 3; + n_bytes += 3; + + if (n_bytes < 64) + { + mask = pow2_mask (n_bytes - 3) << 3; + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); + goto last8; + } + + mask = -1ULL << 3; + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); +next56: + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0), + clib_toeplitz_hash_gfni_two (d0, d1, m[1], 1)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0), + clib_toeplitz_hash_gfni_two (d2, d3, m[1], 1)); + + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[2], 2), + clib_toeplitz_hash_gfni_two (d0, d1, m[3], 3)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[2], 2), + clib_toeplitz_hash_gfni_two (d2, d3, m[3], 3)); + + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[4], 4), + clib_toeplitz_hash_gfni_two (d0, d1, m[5], 5)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[4], 4), + clib_toeplitz_hash_gfni_two (d2, d3, m[5], 5)); + + h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[6], 6); + h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[6], 6); + + n_bytes -= 56; + data0 += 56; + data1 += 56; + data2 += 56; + data3 += 56; + m += 7; + + if (n_bytes >= 64) + { + d0 = *(u8x64u *) data0; + d1 = *(u8x64u *) data1; + d2 = *(u8x64u *) data2; + d3 = 
*(u8x64u *) data3; + goto next56; + } + + if (n_bytes == 0) + goto done; + + mask = pow2_mask (n_bytes); + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); +last8: + h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0); + h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0); + n_bytes -= 8; + + if (n_bytes > 0) + { + u64x8 zero = {}; + m += 1; + d0 = (u8x64) u64x8_align_right (zero, d0, 1); + d1 = (u8x64) u64x8_align_right (zero, d1, 1); + d2 = (u8x64) u64x8_align_right (zero, d2, 1); + d3 = (u8x64) u64x8_align_right (zero, d3, 1); + goto last8; + } + +done: + h = u64x8_hxor (h0); + *hash0 = h; + *hash1 = h >> 32; + h = u64x8_hxor (h2); + *hash2 = h; + *hash3 = h >> 32; +#elif defined(CLIB_HAVE_VEC256) + u8 *key = k->data; + u64x4 v4, shift = { 0, 1, 2, 3 }; + u32x8 h0 = {}, h1 = {}, h2 = {}, h3 = {}; + + while (n_bytes >= 4) + { + v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift; + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[3], 3); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[3], 3); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[3], 3); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[3], 3); + + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + + if (n_bytes == 3) + { + v |= (u64) key[5] << 16; + v |= (u64) key[6] << 8; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2); + } + else if (n_bytes == 2) + { + v |= (u64) key[5] << 16; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + } + else + { + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, 
v4, data3[0], 0); + } + } + + *hash0 = u32x8_hxor (h0); + *hash1 = u32x8_hxor (h1); + *hash2 = u32x8_hxor (h2); + *hash3 = u32x8_hxor (h3); +#else + u8 *key = k->data; + u64 v, h0 = 0, h1 = 0, h2 = 0, h3 = 0; + + while (n_bytes >= 4) + { + v = clib_net_to_host_u64 (*(u64u *) key); + + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[0] & bit ? v : 0; + h1 ^= data1[0] & bit ? v : 0; + h2 ^= data2[0] & bit ? v : 0; + h3 ^= data3[0] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[1] & bit ? v : 0; + h1 ^= data1[1] & bit ? v : 0; + h2 ^= data2[1] & bit ? v : 0; + h3 ^= data3[1] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[2] & bit ? v : 0; + h1 ^= data1[2] & bit ? v : 0; + h2 ^= data2[2] & bit ? v : 0; + h3 ^= data3[2] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[3] & bit ? v : 0; + h1 ^= data1[3] & bit ? v : 0; + h2 ^= data2[3] & bit ? v : 0; + h3 ^= data3[3] & bit ? v : 0; + } + + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[0] & bit ? v : 0; + h1 ^= data1[0] & bit ? v : 0; + h2 ^= data2[0] & bit ? v : 0; + h3 ^= data3[0] & bit ? v : 0; + } + if (n_bytes > 1) + { + v |= (u64) key[5] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[1] & bit ? v : 0; + h1 ^= data1[1] & bit ? v : 0; + h2 ^= data2[1] & bit ? v : 0; + h3 ^= data3[1] & bit ? v : 0; + } + } + if (n_bytes > 2) + { + v |= (u64) key[6] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[2] & bit ? v : 0; + h1 ^= data1[2] & bit ? v : 0; + h2 ^= data2[2] & bit ? v : 0; + h3 ^= data3[2] & bit ? v : 0; + } + } + } + *hash0 = h0 >> 32; + *hash1 = h1 >> 32; + *hash2 = h2 >> 32; + *hash3 = h3 >> 32; +#endif +} + +#endif |
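
Usage note (not part of the diff above): a minimal sketch of how the new Toeplitz API added in src/vppinfra/vector/toeplitz.[ch] fits together, assuming a program built against vppinfra. Only functions that appear in the patch are used (clib_toeplitz_hash_key_init, clib_toeplitz_hash, clib_toeplitz_hash_x4, clib_toeplitz_hash_key_free); the 12-byte tuple contents and the printed output format are illustrative and are not test vectors taken from the patch.

#include <vppinfra/format.h>
#include <vppinfra/mem.h>
#include <vppinfra/vector/toeplitz.h>

int
main (int argc, char *argv[])
{
  clib_toeplitz_hash_key_t *k;
  /* 12-byte RSS-style input: src IP, dst IP, src port, dst port, all in
   * network byte order; the addresses and ports below are made-up examples */
  u8 tuple[12] = { 192, 0, 2, 1, 198, 51, 100, 2, 0x1f, 0x90, 0x00, 0x50 };
  u32 h, h0, h1, h2, h3;

  /* heap must exist before clib_toeplitz_hash_key_init allocates the key */
  clib_mem_init (0, 64ULL << 20);

  /* key == 0 selects the built-in 40-byte default (the well-known RSS key);
   * the key must be at least 4 bytes longer than the data being hashed */
  k = clib_toeplitz_hash_key_init (0, 0);

  /* single-buffer variant */
  h = clib_toeplitz_hash (k, tuple, sizeof (tuple));

  /* 4-buffer variant, here fed the same tuple four times for brevity */
  clib_toeplitz_hash_x4 (k, tuple, tuple, tuple, tuple, &h0, &h1, &h2, &h3,
			 sizeof (tuple));

  fformat (stdout, "hash %x, x4 %x %x %x %x\n", h, h0, h1, h2, h3);

  clib_toeplitz_hash_key_free (k);
  return 0;
}

Depending on the target CPU, the same call resolves to the GFNI/AVX-512, AVX2, or scalar path inside clib_toeplitz_hash; the result is intended to be identical across them, which is what makes a comparison like the one above a reasonable smoke test.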