Diffstat (limited to 'src/vppinfra/vector')
-rw-r--r-- | src/vppinfra/vector/array_mask.h | 119
-rw-r--r-- | src/vppinfra/vector/compress.h | 140
-rw-r--r-- | src/vppinfra/vector/count_equal.h | 306
-rw-r--r-- | src/vppinfra/vector/index_to_ptr.h | 257
-rw-r--r-- | src/vppinfra/vector/ip_csum.h | 339
-rw-r--r-- | src/vppinfra/vector/mask_compare.h | 207
-rw-r--r-- | src/vppinfra/vector/test/array_mask.c | 124
-rw-r--r-- | src/vppinfra/vector/test/compress.c | 248
-rw-r--r-- | src/vppinfra/vector/test/mask_compare.c | 95
-rw-r--r-- | src/vppinfra/vector/test/test.c | 53
-rw-r--r-- | src/vppinfra/vector/test/test.h | 35
-rw-r--r-- | src/vppinfra/vector/toeplitz.c | 122
-rw-r--r-- | src/vppinfra/vector/toeplitz.h | 513
13 files changed, 1942 insertions, 616 deletions
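Before the diffs, a brief illustrative sketch of how the vector helpers touched by this change fit together (editor's example, not part of the patch; the function name example_filter, the element count bound of 256 and the match value 42 are made up, and only APIs that appear in the diffs below are used):

#include <vppinfra/vector/array_mask.h>
#include <vppinfra/vector/compress.h>
#include <vppinfra/vector/count_equal.h>
#include <vppinfra/vector/mask_compare.h>

/* Build a bitmap of elements equal to 42, compress the matching elements
   into a dense array, then mask the remaining values down to 8 bits. */
static u32
example_filter (u32 *elts, u32 n_elts, u32 *matches)
{
  u64 bitmap[4] = {}; /* holds up to 256 element bits */
  u32 n_match;
  uword n_lead;

  clib_mask_compare_u32 (42, elts, bitmap, n_elts);            /* mask_compare.h */
  n_match = clib_compress_u32 (matches, elts, bitmap, n_elts); /* compress.h */
  n_lead = clib_count_equal_u32 (elts, n_elts);                /* count_equal.h */
  (void) n_lead;
  clib_array_mask_u32 (elts, 0xff, n_elts);                    /* array_mask.h */

  return n_match;
}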
diff --git a/src/vppinfra/vector/array_mask.h b/src/vppinfra/vector/array_mask.h index 778ed3e638f..3d4a82ac01b 100644 --- a/src/vppinfra/vector/array_mask.h +++ b/src/vppinfra/vector/array_mask.h @@ -17,61 +17,114 @@ static_always_inline void clib_array_mask_u32 (u32 *src, u32 mask, u32 n_elts) { - u32 i; #if defined(CLIB_HAVE_VEC512) u32x16 mask16 = u32x16_splat (mask); - - for (i = 0; i + 16 <= n_elts; i += 16) - *((u32x16u *) (src + i)) &= mask16; - n_elts -= i; - if (n_elts) + if (n_elts <= 16) { - u16 m = pow2_mask (n_elts); - u32x16_mask_store (u32x16_mask_load_zero (src + i, m) & mask16, src + i, - m); + u32 m = pow2_mask (n_elts); + u32x16 r = u32x16_mask_load_zero (src, m); + u32x16_mask_store (r & mask16, src, m); + return; } - return; + for (; n_elts >= 16; n_elts -= 16, src += 16) + *((u32x16u *) src) &= mask16; + *((u32x16u *) (src + n_elts - 16)) &= mask16; #elif defined(CLIB_HAVE_VEC256) u32x8 mask8 = u32x8_splat (mask); - - for (i = 0; i + 8 <= n_elts; i += 8) - *((u32x8u *) (src + i)) &= mask8; - n_elts -= i; - src += i; #if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) - if (n_elts) + if (n_elts <= 8) { - u8 m = pow2_mask (n_elts); - u32x8_mask_store (u32x8_mask_load_zero (src, m) & mask8, src, m); + u32 m = pow2_mask (n_elts); + u32x8 r = u32x8_mask_load_zero (src, m); + u32x8_mask_store (r & mask8, src, m); + return; + } +#else + if (PREDICT_FALSE (n_elts < 4)) + { + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; + } + if (n_elts <= 8) + { + u32x4 mask4 = u32x4_splat (mask); + *(u32x4u *) src &= mask4; + *(u32x4u *) (src + n_elts - 4) &= mask4; + return; } - return; #endif + + for (; n_elts >= 8; n_elts -= 8, src += 8) + *((u32x8u *) src) &= mask8; + *((u32x8u *) (src + n_elts - 8)) &= mask8; #elif defined(CLIB_HAVE_VEC128) u32x4 mask4 = u32x4_splat (mask); - for (i = 0; i + 4 <= n_elts; i += 4) - *((u32x4u *) (src + i)) &= mask4; - n_elts -= i; - src += i; - switch (n_elts) + if (PREDICT_FALSE (n_elts < 4)) { - case 3: - src[2] &= mask; - case 2: - src[1] &= mask; - case 1: - src[0] &= mask; - case 0: - default:; + if (n_elts & 2) + { + src[0] &= mask; + src[1] &= mask; + src += 2; + } + if (n_elts & 1) + src[0] &= mask; + return; } + + for (; n_elts >= 4; n_elts -= 4, src += 4) + *((u32x4u *) src) &= mask4; + *((u32x4u *) (src + n_elts - 4)) &= mask4; return; -#endif +#else while (n_elts > 0) { src[0] &= mask; src++; n_elts--; } +#endif +} + +static_always_inline void +clib_array_mask_set_u32_x64 (u32 *a, u32 v, uword bmp, int n_elts) +{ +#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u32x16 r = u32x16_splat (v); + for (; n_elts > 0; n_elts -= 16, a += 16, bmp >>= 16) + u32x16_mask_store (r, a, bmp); +#elif defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8 r = u32x8_splat (v); + for (; n_elts > 0; n_elts -= 8, a += 8, bmp >>= 8) + u32x8_mask_store (r, a, bmp); +#else + while (bmp) + { + a[get_lowest_set_bit_index (bmp)] = v; + bmp = clear_lowest_set_bit (bmp); + } +#endif +} + +static_always_inline void +clib_array_mask_set_u32 (u32 *a, u32 v, uword *bmp, u32 n_elts) +{ + while (n_elts >= uword_bits) + { + clib_array_mask_set_u32_x64 (a, v, bmp++[0], uword_bits); + a += uword_bits; + n_elts -= uword_bits; + } + + clib_array_mask_set_u32_x64 (a, v, bmp[0] & pow2_mask (n_elts), n_elts); } #endif diff --git a/src/vppinfra/vector/compress.h b/src/vppinfra/vector/compress.h index adb6503f711..5429113984b 100644 --- a/src/vppinfra/vector/compress.h +++ b/src/vppinfra/vector/compress.h @@ -27,12 +27,40 @@ 
clib_compress_u64_x64 (u64 *dst, u64 *src, u64 mask) mask >>= 4; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u64 * +clib_compress_u64_x64_masked (u64 *dst, u64 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u64x8u *sv = (u64x8u *) src; + for (int i = 0; i < 8; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u64x8u s = u64x8_mask_load_zero (&sv[i], mask); + u64x8_compress_store (s, mask, dst); + dst += _popcnt32 ((u8) mask); + mask >>= 8; } +#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u64x4u *sv = (u64x4u *) src; + for (int i = 0; i < 16; i++) + { + u64x4u s = u64x4_mask_load_zero (&sv[i], mask); + u64x4_compress_store (s, mask, dst); + dst += _popcnt32 (((u8) mask) & 0x0f); + mask >>= 4; + } +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -69,7 +97,9 @@ clib_compress_u64 (u64 *dst, u64 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u64_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u64_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u32 * @@ -93,12 +123,41 @@ clib_compress_u32_x64 (u32 *dst, u32 *src, u64 mask) mask >>= 8; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u32 * +clib_compress_u32_x64_masked (u32 *dst, u32 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u32x16u *sv = (u32x16u *) src; + for (int i = 0; i < 4; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u32x16u s = u32x16_mask_load_zero (&sv[i], mask); + u32x16_compress_store (s, mask, dst); + dst += _popcnt32 ((u16) mask); + mask >>= 16; } + +#elif defined(CLIB_HAVE_VEC256_COMPRESS) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8u *sv = (u32x8u *) src; + for (int i = 0; i < 8; i++) + { + u32x8u s = u32x8_mask_load_zero (&sv[i], mask); + u32x8_compress_store (s, mask, dst); + dst += _popcnt32 ((u8) mask); + mask >>= 8; + } +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -135,7 +194,9 @@ clib_compress_u32 (u32 *dst, u32 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u32_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u32_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u16 * @@ -150,12 +211,30 @@ clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask) mask >>= 32; } #else - while (mask) + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u16 * +clib_compress_u16_x64_masked (u16 *dst, u16 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u16x32u *sv = (u16x32u *) src; + for (int i = 0; i < 2; i++) { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; + u16x32u s = u16x32_mask_load_zero (&sv[i], mask); + u16x32_compress_store (s, mask, dst); + dst += _popcnt32 ((u32) mask); + mask >>= 32; } +#else + u32 i; + foreach_set_bit_index (i, mask) + 
dst++[0] = src[i]; #endif return dst; } @@ -192,7 +271,9 @@ clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u16_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u16_x64_masked (dst, src, + mask[0] & pow2_mask (n_elts)) - + dst0; } static_always_inline u8 * @@ -203,12 +284,26 @@ clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask) u8x64_compress_store (sv[0], mask, dst); dst += _popcnt64 (mask); #else - while (mask) - { - u16 bit = count_trailing_zeros (mask); - mask = clear_lowest_set_bit (mask); - dst++[0] = src[bit]; - } + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; +#endif + return dst; +} + +static_always_inline u8 * +clib_compress_u8_x64_masked (u8 *dst, u8 *src, u64 mask) +{ +#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) && \ + defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + u8x64u *sv = (u8x64u *) src; + u8x64u s = u8x64_mask_load_zero (sv, mask); + u8x64_compress_store (s, mask, dst); + dst += _popcnt64 (mask); +#else + u32 i; + foreach_set_bit_index (i, mask) + dst++[0] = src[i]; #endif return dst; } @@ -245,7 +340,8 @@ clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return dst - dst0; - return clib_compress_u8_x64 (dst, src, mask[0] & pow2_mask (n_elts)) - dst0; + return clib_compress_u8_x64_masked (dst, src, mask[0] & pow2_mask (n_elts)) - + dst0; } #endif diff --git a/src/vppinfra/vector/count_equal.h b/src/vppinfra/vector/count_equal.h new file mode 100644 index 00000000000..ca2fbb7fd39 --- /dev/null +++ b/src/vppinfra/vector/count_equal.h @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_count_equal_h +#define included_vector_count_equal_h +#include <vppinfra/clib.h> + +static_always_inline uword +clib_count_equal_u64 (u64 *data, uword max_count) +{ + uword count; + u64 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u64x4 splat = u64x4_splat (first); + while (count + 3 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u64x4_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 8; + return count; + } + + data += 4; + count += 4; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u32 (u32 *data, uword max_count) +{ + uword count; + u32 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC512) + u32x16 splat = u32x16_splat (first); + while (count + 15 < max_count) + { + u32 bmp; + bmp = u32x16_is_equal_mask (u32x16_load_unaligned (data), splat); + if (bmp != pow2_mask (16)) + return count + count_trailing_zeros (~bmp); + + data += 16; + count += 16; + } + if (count == max_count) + return count; + else + { + u32 mask = pow2_mask (max_count - count); + u32 bmp = + u32x16_is_equal_mask (u32x16_mask_load_zero (data, mask), splat) & + mask; + return count + count_trailing_zeros (~bmp); + } +#elif defined(CLIB_HAVE_VEC256) + u32x8 splat = u32x8_splat (first); + while (count + 
7 < max_count) + { + u32 bmp; +#ifdef __AVX512F__ + bmp = u32x8_is_equal_mask (u32x8_load_unaligned (data), splat); + if (bmp != pow2_mask (8)) + return count + count_trailing_zeros (~bmp); +#else + bmp = u8x32_msb_mask ((u8x32) (u32x8_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + return count + count_trailing_zeros (~bmp) / 4; +#endif + + data += 8; + count += 8; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + else + { + u32 mask = pow2_mask (max_count - count); + u32 bmp = + u32x8_is_equal_mask (u32x8_mask_load_zero (data, mask), splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u32x4 splat = u32x4_splat (first); + while (count + 3 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u32x4_load_unaligned (data) == splat)); + if (bmp != pow2_mask (4 * 4)) + { + count += count_trailing_zeros (~bmp) / 4; + return count; + } + + data += 4; + count += 4; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u16 (u16 *data, uword max_count) +{ + uword count; + u16 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC256) + u16x16 splat = u16x16_splat (first); + while (count + 15 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u16x16_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + { + count += count_trailing_zeros (~bmp) / 2; + return count; + } + + data += 16; + count += 16; + } +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u16x8 splat = u16x8_splat (first); + while (count + 7 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u16x8_load_unaligned (data) == splat)); + if (bmp != 0xffff) + { + count += count_trailing_zeros (~bmp) / 2; + return count; + } + + data += 8; + count += 8; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +static_always_inline uword +clib_count_equal_u8 (u8 *data, uword max_count) +{ + uword count; + u8 first; + + if (max_count <= 1) + return max_count; + if (data[0] != data[1]) + return 1; + + count = 0; + first = data[0]; + +#if defined(CLIB_HAVE_VEC512) + u8x64 splat = u8x64_splat (first); + while (count + 63 < max_count) + { + u64 bmp; + bmp = u8x64_is_equal_mask (u8x64_load_unaligned (data), splat); + if (bmp != -1) + return count + count_trailing_zeros (~bmp); + + data += 64; + count += 64; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE) + else + { + u64 mask = pow2_mask (max_count - count); + u64 bmp = + u8x64_is_equal_mask (u8x64_mask_load_zero (data, mask), splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC256) + u8x32 splat = u8x32_splat (first); + while (count + 31 < max_count) + { + u64 bmp; + bmp = u8x32_msb_mask ((u8x32) (u8x32_load_unaligned (data) == splat)); + if (bmp != 0xffffffff) + 
return count + count_trailing_zeros (~bmp); + + data += 32; + count += 32; + } + if (count == max_count) + return count; +#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + else + { + u32 mask = pow2_mask (max_count - count); + u64 bmp = + u8x32_msb_mask (u8x32_mask_load_zero (data, mask) == splat) & mask; + return count + count_trailing_zeros (~bmp); + } +#endif +#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK) + u8x16 splat = u8x16_splat (first); + while (count + 15 < max_count) + { + u64 bmp; + bmp = u8x16_msb_mask ((u8x16) (u8x16_load_unaligned (data) == splat)); + if (bmp != 0xffff) + return count + count_trailing_zeros (~bmp); + + data += 16; + count += 16; + } +#else + count += 2; + data += 2; + while (count + 3 < max_count && ((data[0] ^ first) | (data[1] ^ first) | + (data[2] ^ first) | (data[3] ^ first)) == 0) + { + data += 4; + count += 4; + } +#endif + while (count < max_count && (data[0] == first)) + { + data += 1; + count += 1; + } + return count; +} + +#endif diff --git a/src/vppinfra/vector/index_to_ptr.h b/src/vppinfra/vector/index_to_ptr.h new file mode 100644 index 00000000000..3985b757d54 --- /dev/null +++ b/src/vppinfra/vector/index_to_ptr.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_index_to_ptr_h +#define included_vector_index_to_ptr_h +#include <vppinfra/clib.h> + +#ifdef CLIB_HAVE_VEC128 +static_always_inline void +clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift) +{ + u32x4 iv4 = u32x4_load_unaligned (indices + i); + u64x2 pv2; + pv2 = u64x2_from_u32x4 (iv4); + u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i); +#ifdef __aarch64__ + pv2 = u64x2_from_u32x4_high (iv4); +#else + pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8)); +#endif + u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2); +} +#endif + +/** \brief Convert array of indices to pointers with base and shift + + @param indices source array of u32 indices + @param base base pointer + @param shift numbers of bits to be shifted + @param ptrs destinatin array of pointers + @param n_elts number of elements in the source array +*/ + +static_always_inline void +clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs, + u32 n_elts) +{ +#if defined CLIB_HAVE_VEC512 + if (n_elts >= 8) + { + u64x8 off = u64x8_splat ((u64) base); + u64x8 b0, b1, b2, b3, b4, b5, b6, b7; + + while (n_elts >= 64) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16)); + b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24)); + b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32)); + b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40)); + b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48)); + b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16); + u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24); + u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32); + u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40); + u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48); + u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56); + ptrs += 64; + indices += 64; + n_elts -= 64; + } + + if (n_elts == 0) + return; + + if (n_elts >= 32) + { + 
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16)); + b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16); + u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24); + ptrs += 32; + indices += 32; + n_elts -= 32; + } + if (n_elts >= 16) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8); + ptrs += 16; + indices += 16; + n_elts -= 16; + } + if (n_elts >= 8) + { + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs); + ptrs += 8; + indices += 8; + n_elts -= 8; + } + + if (n_elts == 0) + return; + + b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8)); + u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8); + } + else + { + u32 mask = pow2_mask (n_elts); + u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask)); + u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask); + return; + } +#elif defined CLIB_HAVE_VEC256 + if (n_elts >= 4) + { + u64x4 off = u64x4_splat ((u64) base); + u64x4 b0, b1, b2, b3, b4, b5, b6, b7; + + while (n_elts >= 32) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8)); + b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12)); + b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16)); + b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20)); + b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24)); + b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8); + u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12); + u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16); + u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20); + u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24); + u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28); + ptrs += 32; + indices += 32; + n_elts -= 32; + } + + if (n_elts == 0) + return; + + if (n_elts >= 16) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8)); + b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8); + u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12); + ptrs += 16; + indices += 16; + n_elts -= 16; + } + if (n_elts >= 8) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs); + u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4); + ptrs += 8; + indices += 8; + n_elts -= 8; + } + if (n_elts > 4) + { + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices)); + u64x4_store_unaligned ((b0 << shift) + 
off, ptrs); + ptrs += 4; + indices += 4; + n_elts -= 4; + } + + b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4)); + u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4); + return; + } +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + else + { + u32 mask = pow2_mask (n_elts); + u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask)); + u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask); + return; + } +#endif +#elif defined(CLIB_HAVE_VEC128) + if (n_elts >= 4) + { + u64x2 ov = u64x2_splat ((u64) base); + u32 *i = (u32 *) indices; + void **p = (void **) ptrs; + u32 n = n_elts; + + while (n >= 32) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift); + indices += 32; + ptrs += 32; + n -= 32; + } + + if (n == 0) + return; + + if (n >= 16) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift); + indices += 16; + ptrs += 16; + n -= 16; + } + + if (n >= 8) + { + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift); + indices += 8; + ptrs += 8; + n -= 8; + } + + if (n > 4) + clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift); + + clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift); + return; + } +#endif + while (n_elts) + { + ptrs[0] = base + ((u64) indices[0] << shift); + ptrs += 1; + indices += 1; + n_elts -= 1; + } +} + +#endif diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h new file mode 100644 index 00000000000..2cea9b448ea --- /dev/null +++ b/src/vppinfra/vector/ip_csum.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. 
+ */ + +#ifndef included_vector_ip_csum_h +#define included_vector_ip_csum_h +#include <vppinfra/clib.h> +typedef struct +{ + u64 sum; + u8 odd; +} clib_ip_csum_t; + +#if defined(CLIB_HAVE_VEC128) +static_always_inline u64x2 +clib_ip_csum_cvt_and_add_4 (u32x4 v) +{ + return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) + + (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_2 (u64x2 v) +{ + return v[0] + v[1]; +} +#endif + +#if defined(CLIB_HAVE_VEC256) +static_always_inline u64x4 +clib_ip_csum_cvt_and_add_8 (u32x8 v) +{ + return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) + + (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_4 (u64x4 v) +{ + return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v)); +} +#endif + +#if defined(CLIB_HAVE_VEC512) +static_always_inline u64x8 +clib_ip_csum_cvt_and_add_16 (u32x16 v) +{ + return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) + + (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_8 (u64x8 v) +{ + return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v)); +} +#endif + +static_always_inline void +clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count, + int is_copy) +{ + if (c->odd) + { + c->odd = 0; + c->sum += (u16) src[0] << 8; + count--; + src++; + if (is_copy) + dst++[0] = src[0]; + } + +#if defined(CLIB_HAVE_VEC512) + u64x8 sum8 = {}; + + while (count >= 512) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[1]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[2]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[3]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[8]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[5]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[6]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[7]); + count -= 512; + src += 512; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 512; + } + } + + while (count >= 64) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + count -= 64; + src += 64; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + dst += 512; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u64 mask = pow2_mask (count); + u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask); + sum8 += clib_ip_csum_cvt_and_add_16 (v); + c->odd = count & 1; + if (is_copy) + u32x16_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_8 (sum8); + return; +#endif + + c->sum += clib_ip_csum_hadd_8 (sum8); +#elif defined(CLIB_HAVE_VEC256) + u64x4 sum4 = {}; + + while (count >= 256) + { + u32x8u *s = (u32x8u *) src; + sum4 += clib_ip_csum_cvt_and_add_8 (s[0]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[1]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[2]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[3]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[4]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[5]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[6]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[7]); + count -= 256; + src += 256; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 256; + } + } + + while (count >= 32) + { + u32x8u *s = (u32x8u *) src; + sum4 += 
clib_ip_csum_cvt_and_add_8 (s[0]); + count -= 32; + src += 32; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + dst += 32; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u32 mask = pow2_mask (count); + u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask); + sum4 += clib_ip_csum_cvt_and_add_8 (v); + c->odd = count & 1; + if (is_copy) + u32x8_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_4 (sum4); + return; +#endif + + c->sum += clib_ip_csum_hadd_4 (sum4); +#elif defined(CLIB_HAVE_VEC128) + u64x2 sum2 = {}; + + while (count >= 128) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[1]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[2]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[3]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[4]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[5]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[6]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[7]); + count -= 128; + src += 128; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 128; + } + } + + while (count >= 16) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + count -= 16; + src += 16; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + dst += 16; + } + } + c->sum += clib_ip_csum_hadd_2 (sum2); +#else + while (count >= 4) + { + u32 v = *((u32 *) src); + c->sum += v; + count -= 4; + src += 4; + if (is_copy) + { + *(u32 *) dst = v; + dst += 4; + } + } +#endif + while (count >= 2) + { + u16 v = *((u16 *) src); + c->sum += v; + count -= 2; + src += 2; + if (is_copy) + { + *(u16 *) dst = v; + dst += 2; + } + } + + if (count) + { + c->odd = 1; + c->sum += (u16) src[0]; + if (is_copy) + dst[0] = src[0]; + } +} + +static_always_inline u16 +clib_ip_csum_fold (clib_ip_csum_t *c) +{ + u64 sum = c->sum; +#if defined(__x86_64__) && defined(__BMI2__) + u64 tmp = sum; + asm volatile( + /* using ADC is much faster than mov, shift, add sequence + * compiler produces */ + "shr $32, %[sum] \n\t" + "add %k[tmp], %k[sum] \n\t" + "mov $16, %k[tmp] \n\t" + "shrx %k[tmp], %k[sum], %k[tmp] \n\t" + "adc %w[tmp], %w[sum] \n\t" + "adc $0, %w[sum] \n\t" + : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp)); +#else + sum = ((u32) sum) + (sum >> 32); + sum = ((u16) sum) + (sum >> 16); + sum = ((u16) sum) + (sum >> 16); +#endif + return (~((u16) sum)); +} + +static_always_inline void +clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count) +{ + return clib_ip_csum_inline (c, 0, src, count, 0); +} + +static_always_inline void +clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count) +{ + return clib_ip_csum_inline (c, dst, src, count, 1); +} + +static_always_inline u16 +clib_ip_csum (u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + if (COMPILE_TIME_CONST (count) && count == 12) + { + for (int i = 0; i < 3; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 20) + { + for (int i = 0; i < 5; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 40) + { + for (int i = 0; i < 10; i++) + c.sum += ((u32 *) src)[i]; + } + else + clib_ip_csum_inline (&c, 0, src, count, 0); + return clib_ip_csum_fold (&c); +} + +static_always_inline u16 +clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + clib_ip_csum_inline (&c, dst, src, count, 1); + return clib_ip_csum_fold (&c); +} + 
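/* Editor's note, not part of this header: a minimal usage sketch of the
 * incremental checksum API above; "hdr", "payload", "data" and the lengths
 * are hypothetical buffers/sizes.
 *
 *   clib_ip_csum_t c = {};
 *   clib_ip_csum_chunk (&c, hdr, 20);      // e.g. a 20-byte IPv4 header
 *   clib_ip_csum_chunk (&c, payload, len); // any number of further chunks
 *   u16 folded = clib_ip_csum_fold (&c);   // ones-complement fold of the sum
 *
 * or, for a single contiguous buffer:
 *
 *   u16 folded = clib_ip_csum (data, count);
 */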
+#endif diff --git a/src/vppinfra/vector/mask_compare.h b/src/vppinfra/vector/mask_compare.h index cac48a31f47..fc72d7dac35 100644 --- a/src/vppinfra/vector/mask_compare.h +++ b/src/vppinfra/vector/mask_compare.h @@ -8,7 +8,7 @@ #include <vppinfra/memcpy.h> static_always_inline u64 -clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts) +clib_mask_compare_u16_x64 (u16 v, u16 *a) { u64 mask = 0; #if defined(CLIB_HAVE_VEC512) @@ -47,6 +47,38 @@ clib_mask_compare_u16_x64 (u16 v, u16 *a, u32 n_elts) (u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 | (u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48); #else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u16_x64_n (u16 v, u16 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u16x32 v32 = u16x32_splat (v); + u16x32u *av = (u16x32u *) a; + mask = ((u64) u16x32_is_equal_mask ( + u16x32_mask_load_zero (&av[0], data_mask), v32) | + (u64) u16x32_is_equal_mask ( + u16x32_mask_load_zero (&av[1], data_mask >> 32), v32) + << 32); +#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u16x16 v16 = u16x16_splat (v); + u16x16u *av = (u16x16u *) a; + i8x32 x; + + x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[0], data_mask), + v16 == u16x16_mask_load_zero (&av[1], data_mask >> 16)); + mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)); + x = i8x32_pack (v16 == u16x16_mask_load_zero (&av[2], data_mask >> 32), + v16 == u16x16_mask_load_zero (&av[3], data_mask >> 48)); + mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32; +#else for (int i = 0; i < n_elts; i++) if (a[i] == v) mask |= 1ULL << i; @@ -68,7 +100,7 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts) { while (n_elts >= 64) { - mask++[0] = clib_mask_compare_u16_x64 (v, a, 64); + mask++[0] = clib_mask_compare_u16_x64 (v, a); n_elts -= 64; a += 64; } @@ -76,11 +108,11 @@ clib_mask_compare_u16 (u16 v, u16 *a, u64 *mask, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return; - mask[0] = clib_mask_compare_u16_x64 (v, a, n_elts) & pow2_mask (n_elts); + mask[0] = clib_mask_compare_u16_x64_n (v, a, n_elts) & pow2_mask (n_elts); } static_always_inline u64 -clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts) +clib_mask_compare_u32_x64 (u32 v, u32 *a) { u64 mask = 0; #if defined(CLIB_HAVE_VEC512) @@ -131,6 +163,57 @@ clib_mask_compare_u32_x64 (u32 v, u32 *a, u32 n_elts) } #else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u32_x64_n (u32 v, u32 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u32x16 v16 = u32x16_splat (v); + u32x16u *av = (u32x16u *) a; + mask = ((u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[0], data_mask), v16) | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[1], data_mask >> 16), v16) + << 16 | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[2], data_mask >> 32), v16) + << 32 | + (u64) u32x16_is_equal_mask ( + u32x16_mask_load_zero (&av[3], data_mask >> 48), v16) + << 48); +#elif defined(CLIB_HAVE_VEC256) && defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u32x8 v8 = u32x8_splat (v); + u32x8u *av = (u32x8u *) a; + u32x8 m = { 0, 4, 1, 5, 2, 6, 3, 7 }; + i8x32 c; + + c = i8x32_pack ( + i16x16_pack ( + (i32x8) (v8 == 
u32x8_mask_load_zero (&av[0], data_mask)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[1], data_mask >> 8))), + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[2], data_mask >> 16)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[3], data_mask >> 24)))); + mask = i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)); + + c = i8x32_pack ( + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[4], data_mask >> 32)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[5], data_mask >> 40))), + i16x16_pack ( + (i32x8) (v8 == u32x8_mask_load_zero (&av[6], data_mask >> 48)), + (i32x8) (v8 == u32x8_mask_load_zero (&av[7], data_mask >> 56)))); + mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32; + mask |= (u64) i8x32_msb_mask ((i8x32) u32x8_permute ((u32x8) c, m)) << 32; +#else for (int i = 0; i < n_elts; i++) if (a[i] == v) mask |= 1ULL << i; @@ -152,7 +235,119 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts) { while (n_elts >= 64) { - bitmap++[0] = clib_mask_compare_u32_x64 (v, a, 64); + bitmap++[0] = clib_mask_compare_u32_x64 (v, a); + n_elts -= 64; + a += 64; + } + + if (PREDICT_TRUE (n_elts == 0)) + return; + + bitmap[0] = clib_mask_compare_u32_x64_n (v, a, n_elts) & pow2_mask (n_elts); +} + +static_always_inline u64 +clib_mask_compare_u64_x64 (u64 v, u64 *a) +{ + u64 mask = 0; +#if defined(CLIB_HAVE_VEC512) + u64x8 v8 = u64x8_splat (v); + u64x8u *av = (u64x8u *) a; + mask = ((u64) u64x8_is_equal_mask (av[0], v8) | + (u64) u64x8_is_equal_mask (av[1], v8) << 8 | + (u64) u64x8_is_equal_mask (av[2], v8) << 16 | + (u64) u64x8_is_equal_mask (av[3], v8) << 24 | + (u64) u64x8_is_equal_mask (av[4], v8) << 32 | + (u64) u64x8_is_equal_mask (av[5], v8) << 40 | + (u64) u64x8_is_equal_mask (av[6], v8) << 48 | + (u64) u64x8_is_equal_mask (av[7], v8) << 56); + +#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) + u64x4 v4 = u64x4_splat (v); + u64x4u *av = (u64x4u *) a; + + for (int i = 0; i < 16; i += 2) + { + u64 l = u8x32_msb_mask (v4 == av[i]); + u64 h = u8x32_msb_mask (v4 == av[i + 1]); + mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4); + } +#else + for (int i = 0; i < 64; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +static_always_inline u64 +clib_mask_compare_u64_x64_n (u64 v, u64 *a, u32 n_elts) +{ + u64 mask = 0; + CLIB_UNUSED (u64 data_mask) = pow2_mask (n_elts); +#if defined(CLIB_HAVE_VEC512) + u64x8 v8 = u64x8_splat (v); + u64x8u *av = (u64x8u *) a; + mask = + ((u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[0], data_mask), v8) | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[1], data_mask >> 8), + v8) + << 8 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[2], data_mask >> 16), + v8) + << 16 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[3], data_mask >> 24), + v8) + << 24 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[4], data_mask >> 32), + v8) + << 32 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[5], data_mask >> 40), + v8) + << 40 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[6], data_mask >> 48), + v8) + << 48 | + (u64) u64x8_is_equal_mask (u64x8_mask_load_zero (&av[7], data_mask >> 56), + v8) + << 56); + +#elif defined(CLIB_HAVE_VEC256) && defined(__BMI2__) && \ + defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE) + u64x4 v4 = u64x4_splat (v); + u64x4u *av = (u64x4u *) a; + + for (int i = 0; i < 16; i += 2) + { + u64 l = u8x32_msb_mask (v4 == u64x4_mask_load_zero (&av[i], data_mask)); + u64 h = u8x32_msb_mask ( + v4 == u64x4_mask_load_zero (&av[i + 1], 
data_mask >> 4)); + mask |= _pext_u64 (l | h << 32, 0x0101010101010101) << (i * 4); + data_mask >>= 8; + } +#else + for (int i = 0; i < n_elts; i++) + if (a[i] == v) + mask |= 1ULL << i; +#endif + return mask; +} + +/** \brief Compare 64-bit elemments with provied value and return bitmap + + @param v value to compare elements with + @param a array of u64 elements + @param mask array of u64 where reuslting mask will be stored + @param n_elts number of elements in the array + @return none +*/ + +static_always_inline void +clib_mask_compare_u64 (u64 v, u64 *a, u64 *bitmap, u32 n_elts) +{ + while (n_elts >= 64) + { + bitmap++[0] = clib_mask_compare_u64_x64 (v, a); n_elts -= 64; a += 64; } @@ -160,7 +355,7 @@ clib_mask_compare_u32 (u32 v, u32 *a, u64 *bitmap, u32 n_elts) if (PREDICT_TRUE (n_elts == 0)) return; - bitmap[0] = clib_mask_compare_u32_x64 (v, a, n_elts) & pow2_mask (n_elts); + bitmap[0] = clib_mask_compare_u64_x64_n (v, a, n_elts) & pow2_mask (n_elts); } #endif diff --git a/src/vppinfra/vector/test/array_mask.c b/src/vppinfra/vector/test/array_mask.c deleted file mode 100644 index a1f4da728d4..00000000000 --- a/src/vppinfra/vector/test/array_mask.c +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/array_mask.h> - -__clib_test_fn void -clib_array_mask_u32_wrapper (u32 *src, u32 mask, u32 n_elts) -{ - clib_array_mask_u32 (src, mask, n_elts); -} - -typedef struct -{ - u32 mask; - u32 expected[256]; -} array_mask_test_t; - -static array_mask_test_t tests[] = { - /* mask values 0x1, output array of alternating 0 1 0 1 .. */ - { .mask = 1, - .expected = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 } }, - /* mask values 0xFFFFFFFF, output array of 0, 1, 2, .., 255 */ - { .mask = ~0U, - .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, - 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, - 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, - 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, - 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, - 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 
197, 198, 199, 200, 201, 202, 203, - 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, - 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, - 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, - 252, 253, 254, 255 } }, - /* mask values 0xF, output array of 0, .., 15, 0, .., 15, 0, .., 15 */ - { .mask = 15, - .expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } }, - /* mask values 0x1, output array of 1, 0, 1, 0,.. */ - { .mask = 1, .expected = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 } }, -}; - -static clib_error_t * -test_clib_array_mask_u32 (clib_error_t *err) -{ - u32 i, j; - for (i = 0; i < ARRAY_LEN (tests) - 1; i++) - { - u32 src[256]; - for (j = 0; j < ARRAY_LEN (src); j++) - src[j] = j; - - array_mask_test_t *t = tests + i; - clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src)); - for (j = 0; j < ARRAY_LEN (src); j++) - { - if (src[j] != t->expected[j]) - return clib_error_return (err, - "testcase %u failed at " - "(src[%u] = 0x%x, expected 0x%x)", - i, j, src[j], t->expected[j]); - } - } - - u32 src[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - array_mask_test_t *t = tests + i; - - clib_array_mask_u32_wrapper (src, t->mask, ARRAY_LEN (src)); - for (j = 0; j < ARRAY_LEN (src); j++) - { - if (src[j] != t->expected[j]) - return clib_error_return (err, - "testcase %u failed at " - "(src[%u] = 0x%x, expected 0x%x)", - i, j, src[j], t->expected[j]); - } - - return err; -} - -REGISTER_TEST (clib_array_mask_u32) = { - .name = "clib_array_mask_u32", - .fn = test_clib_array_mask_u32, -}; diff --git a/src/vppinfra/vector/test/compress.c b/src/vppinfra/vector/test/compress.c deleted file mode 100644 index 9bc53ff1e41..00000000000 --- a/src/vppinfra/vector/test/compress.c +++ /dev/null @@ -1,248 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. 
- */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/compress.h> - -__clib_test_fn u32 -clib_compress_u64_wrapper (u64 *dst, u64 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u64 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u32_wrapper (u32 *dst, u32 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u32 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u16_wrapper (u16 *dst, u16 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u16 (dst, src, mask, n_elts); -} - -__clib_test_fn u32 -clib_compress_u8_wrapper (u8 *dst, u8 *src, u64 *mask, u32 n_elts) -{ - return clib_compress_u8 (dst, src, mask, n_elts); -} - -typedef struct -{ - u64 mask[10]; - u32 n_elts; -} compress_test_t; - -static compress_test_t tests[] = { - { .mask = { 1 }, .n_elts = 1 }, - { .mask = { 2 }, .n_elts = 2 }, - { .mask = { 3 }, .n_elts = 2 }, - { .mask = { 0, 1 }, .n_elts = 66 }, - { .mask = { 0, 2 }, .n_elts = 69 }, - { .mask = { 0, 3 }, .n_elts = 66 }, - { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 62 }, - { .mask = { ~0ULL, ~0ULL, ~0ULL, ~0ULL }, .n_elts = 255 }, - { .mask = { ~0ULL, 1, 1, ~0ULL }, .n_elts = 256 }, -}; - -static clib_error_t * -test_clib_compress_u64 (clib_error_t *err) -{ - u64 src[513]; - u64 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u64 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5a5a5a5a5a5a5; - - r = clib_compress_u64_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%lx, src[%u] = 0x%lx)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; - - return err; -} - -static clib_error_t * -test_clib_compress_u32 (clib_error_t *err) -{ - u32 src[513]; - u32 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u32 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5a5a5; - - r = clib_compress_u32_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5a5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -static clib_error_t * -test_clib_compress_u16 (clib_error_t *err) -{ - u16 src[513]; - u16 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u16 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5a5; - - r = clib_compress_u16_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - 
{ - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5a5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -static clib_error_t * -test_clib_compress_u8 (clib_error_t *err) -{ - u8 src[513]; - u8 dst[513]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (src); i++) - src[i] = i; - - for (i = 0; i < ARRAY_LEN (tests); i++) - { - compress_test_t *t = tests + i; - u8 *dp = dst; - u32 r; - - for (j = 0; j < ARRAY_LEN (dst); j++) - dst[j] = 0xa5; - - r = clib_compress_u8_wrapper (dst, src, t->mask, t->n_elts); - - for (j = 0; j < t->n_elts; j++) - { - if ((t->mask[j >> 6] & (1ULL << (j & 0x3f))) == 0) - continue; - if (dp[0] != src[j]) - return clib_error_return (err, - "wrong data in testcase %u at " - "(dst[%u] = 0x%x, src[%u] = 0x%x)", - i, dp - dst, dp[0], j, src[j]); - dp++; - } - - if (dst[dp - dst + 1] != 0xa5) - return clib_error_return (err, "buffer overrun in testcase %u", i); - - if (dp - dst != r) - return clib_error_return (err, "wrong number of elts in testcase %u", - i); - } - - return err; -} - -REGISTER_TEST (clib_compress_u64) = { - .name = "clib_compress_u64", - .fn = test_clib_compress_u64, -}; - -REGISTER_TEST (clib_compress_u32) = { - .name = "clib_compress_u32", - .fn = test_clib_compress_u32, -}; - -REGISTER_TEST (clib_compress_u16) = { - .name = "clib_compress_u16", - .fn = test_clib_compress_u16, -}; - -REGISTER_TEST (clib_compress_u8) = { - .name = "clib_compress_u8", - .fn = test_clib_compress_u8, -}; diff --git a/src/vppinfra/vector/test/mask_compare.c b/src/vppinfra/vector/test/mask_compare.c deleted file mode 100644 index 64df0ee084a..00000000000 --- a/src/vppinfra/vector/test/mask_compare.c +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. 
- */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> -#include <vppinfra/vector/mask_compare.h> - -__clib_test_fn void -clib_mask_compare_u16_wrapper (u16 v, u16 *a, u64 *mask, u32 n_elts) -{ - clib_mask_compare_u16 (v, a, mask, n_elts); -} - -__clib_test_fn void -clib_mask_compare_u32_wrapper (u32 v, u32 *a, u64 *mask, u32 n_elts) -{ - clib_mask_compare_u32 (v, a, mask, n_elts); -} - -static clib_error_t * -test_clib_mask_compare_u16 (clib_error_t *err) -{ - u16 array[513]; - u64 mask[10]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (array); i++) - array[i] = i; - - for (i = 0; i < ARRAY_LEN (array); i++) - { - for (j = 0; j < ARRAY_LEN (mask); j++) - mask[j] = 0xa5a5a5a5a5a5a5a5; - - clib_mask_compare_u16_wrapper (i, array, mask, i + 1); - - for (j = 0; j < (i >> 6); j++) - { - if (mask[j]) - return clib_error_return (err, "mask at position %u not zero", j); - } - if (mask[j] != 1ULL << (i & 0x3f)) - return clib_error_return (err, - "mask at position %u is %lx, expected %lx", - j, mask[j], 1ULL << (i % 64)); - - if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "mask overrun at position %u", j + 1); - } - return err; -} - -REGISTER_TEST (clib_mask_compare_u16) = { - .name = "clib_mask_compare_u16", - .fn = test_clib_mask_compare_u16, -}; - -static clib_error_t * -test_clib_mask_compare_u32 (clib_error_t *err) -{ - u32 array[513]; - u64 mask[10]; - u32 i, j; - - for (i = 0; i < ARRAY_LEN (array); i++) - array[i] = i; - - for (i = 0; i < ARRAY_LEN (array); i++) - { - for (j = 0; j < ARRAY_LEN (mask); j++) - mask[j] = 0xa5a5a5a5a5a5a5a5; - - clib_mask_compare_u32_wrapper (i, array, mask, i + 1); - - for (j = 0; j < (i >> 6); j++) - { - if (mask[j]) - return clib_error_return (err, "mask at position %u not zero", j); - } - if (mask[j] != 1ULL << (i & 0x3f)) - return clib_error_return (err, - "mask at position %u is %lx, expected %lx", - j, mask[j], 1ULL << (i % 64)); - - if (mask[j + 1] != 0xa5a5a5a5a5a5a5a5) - return clib_error_return (err, "mask overrun at position %u", j + 1); - } - return err; -} - -REGISTER_TEST (clib_mask_compare_u32) = { - .name = "clib_mask_compare_u32", - .fn = test_clib_mask_compare_u32, -}; diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c deleted file mode 100644 index 1a8b9d6ea10..00000000000 --- a/src/vppinfra/vector/test/test.c +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#include <vppinfra/format.h> -#include <vppinfra/vector/test/test.h> - -test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS] = {}; - -int -test_march_supported (clib_march_variant_type_t type) -{ -#define _(s, n) \ - if (CLIB_MARCH_VARIANT_TYPE_##s == type) \ - return clib_cpu_march_priority_##s (); - foreach_march_variant -#undef _ - return 0; -} - -int -main (int argc, char *argv[]) -{ - clib_mem_init (0, 64ULL << 20); - - for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++) - { - test_registration_t *r = test_registrations[i]; - - if (r == 0 || test_march_supported (i) < 0) - continue; - - fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i); - fformat (stdout, - "-------------------------------------------------------\n"); - while (r) - { - clib_error_t *err; - err = (r->fn) (0); - fformat (stdout, "%-50s %s\n", r->name, err ? 
"FAIL" : "PASS"); - if (err) - { - clib_error_report (err); - fformat (stdout, "\n"); - } - - r = r->next; - } - } - - fformat (stdout, "\n"); - return 0; -} diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h deleted file mode 100644 index bc499fb24e8..00000000000 --- a/src/vppinfra/vector/test/test.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright(c) 2021 Cisco Systems, Inc. - */ - -#ifndef included_test_test_h -#define included_test_test_h - -#include <vppinfra/cpu.h> - -typedef clib_error_t *(test_fn_t) (clib_error_t *); - -typedef struct test_registration_ -{ - char *name; - u8 multiarch : 1; - test_fn_t *fn; - struct test_registration_ *next; -} test_registration_t; - -extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS]; - -#define __clib_test_fn static __clib_noinline __clib_section (".test_wrapper") - -#define REGISTER_TEST(x) \ - test_registration_t CLIB_MARCH_SFX (__test_##x); \ - static void __clib_constructor CLIB_MARCH_SFX (__test_registration_##x) ( \ - void) \ - { \ - test_registration_t *r = &CLIB_MARCH_SFX (__test_##x); \ - r->next = test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)]; \ - test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r; \ - } \ - test_registration_t CLIB_MARCH_SFX (__test_##x) - -#endif diff --git a/src/vppinfra/vector/toeplitz.c b/src/vppinfra/vector/toeplitz.c new file mode 100644 index 00000000000..fcc4b64ad19 --- /dev/null +++ b/src/vppinfra/vector/toeplitz.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#include <vppinfra/clib.h> +#include <vppinfra/mem.h> +#include <vppinfra/vector/toeplitz.h> + +static u8 default_key[40] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, + 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, + 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, + 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, +}; + +#ifdef __x86_64__ +static_always_inline void +clib_toeplitz_hash_key_expand_8 (u64x2 kv, u64x8u *m) +{ + u64x8 kv4, a, b, shift = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + kv4 = (u64x8){ kv[0], kv[1], kv[0], kv[1], kv[0], kv[1], kv[0], kv[1] }; + + /* clang-format off */ + /* create 8 byte-swapped copies of the bytes 0 - 7 */ + a = (u64x8) u8x64_shuffle (kv4, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0); + /* create 8 byte-swapped copies of the bytes 4 - 11 */ + b = (u64x8) u8x64_shuffle (kv4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, + 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4); + /* clang-format on */ + + /* shift each 64-bit element for 0 - 7 bits */ + a <<= shift; + b <<= shift; + + /* clang-format off */ + /* construct eight 8x8 bit matrix used by gf2p8affine */ + * m = (u64x8) u8x64_shuffle2 (a, b, + 0x07, 0x0f, 0x17, 0x1f, 0x27, 0x2f, 0x37, 0x3f, + 0x06, 0x0e, 0x16, 0x1e, 0x26, 0x2e, 0x36, 0x3e, + 0x05, 0x0d, 0x15, 0x1d, 0x25, 0x2d, 0x35, 0x3d, + 0x04, 0x0c, 0x14, 0x1c, 0x24, 
0x2c, 0x34, 0x3c, + 0x47, 0x4f, 0x57, 0x5f, 0x67, 0x6f, 0x77, 0x7f, + 0x46, 0x4e, 0x56, 0x5e, 0x66, 0x6e, 0x76, 0x7e, + 0x45, 0x4d, 0x55, 0x5d, 0x65, 0x6d, 0x75, 0x7d, + 0x44, 0x4c, 0x54, 0x5c, 0x64, 0x6c, 0x74, 0x7c); + /* clang-format on */ +} + +void +clib_toeplitz_hash_key_expand (u64 *matrixes, u8 *key, int size) +{ + u64x8u *m = (u64x8u *) matrixes; + u64x2 kv = {}, zero = {}; + + while (size >= 8) + { + kv = *(u64x2u *) key; + clib_toeplitz_hash_key_expand_8 (kv, m); + key += 8; + m++; + size -= 8; + } + + kv = u64x2_shuffle2 (kv, zero, 1, 2); + clib_toeplitz_hash_key_expand_8 (kv, m); +} +#endif + +__clib_export clib_toeplitz_hash_key_t * +clib_toeplitz_hash_key_init (u8 *key, u32 keylen) +{ + clib_toeplitz_hash_key_t *k; + u32 size, gfni_size = 0; + + if (key == 0) + { + key = default_key; + keylen = sizeof (default_key); + } + + size = + round_pow2 (sizeof (clib_toeplitz_hash_key_t) + round_pow2 (keylen, 16), + CLIB_CACHE_LINE_BYTES); +#ifdef __x86_64__ + gfni_size = round_pow2 ((keylen + 1) * 8, CLIB_CACHE_LINE_BYTES); +#endif + + k = clib_mem_alloc_aligned (size + gfni_size, CLIB_CACHE_LINE_BYTES); + clib_memset_u8 (k, 0, size + gfni_size); + k->key_length = keylen; + k->gfni_offset = size; + clib_memcpy_fast (k->data, key, keylen); + +#ifdef __x86_64__ + clib_toeplitz_hash_key_expand ((u64 *) ((u8 *) k + k->gfni_offset), k->data, + k->key_length); +#endif + + return k; +} + +__clib_export void +clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k) +{ + clib_mem_free (k); +} diff --git a/src/vppinfra/vector/toeplitz.h b/src/vppinfra/vector/toeplitz.h new file mode 100644 index 00000000000..76297f05195 --- /dev/null +++ b/src/vppinfra/vector/toeplitz.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_toeplitz_h +#define included_vector_toeplitz_h +#include <vppinfra/clib.h> + +typedef struct +{ + u16 key_length; + u16 gfni_offset; + u8 data[]; +} clib_toeplitz_hash_key_t; + +clib_toeplitz_hash_key_t *clib_toeplitz_hash_key_init (u8 *key, u32 keylen); +void clib_toeplitz_hash_key_free (clib_toeplitz_hash_key_t *k); + +#ifdef CLIB_HAVE_VEC256 +static_always_inline u32x8 +toeplitz_hash_one_x8 (u32x8 hash, u64x4 v4, u8 data, u8 off) +{ + u32x8 v8 = u32x8_shuffle2 (v4 << (off * 8), v4 << (off * 8 + 4), + /*uppper 32 bits of each u64 in reverse order */ + 15, 13, 11, 9, 7, 5, 3, 1); + +#ifdef CLIB_HAVE_VEC256_MASK_BITWISE_OPS + return u32x8_mask_xor (hash, v8, data); +#else + static const u32x8 bits = { 1, 2, 4, 8, 16, 32, 64, 128 }; + return hash ^ (((u32x8_splat (data) & bits) != u32x8_zero ()) & v8); +#endif +} +#endif + +#if defined(__GFNI__) && defined(__AVX512F__) +static const u8x64 __clib_toeplitz_hash_gfni_permute = { + /* clang-format off */ + 0x00, 0x01, 0x02, 0x03, 0x40, 0x41, 0x42, 0x43, + 0x01, 0x02, 0x03, 0x04, 0x41, 0x42, 0x43, 0x44, + 0x02, 0x03, 0x04, 0x05, 0x42, 0x43, 0x44, 0x45, + 0x03, 0x04, 0x05, 0x06, 0x43, 0x44, 0x45, 0x46, + 0x04, 0x05, 0x06, 0x07, 0x44, 0x45, 0x46, 0x47, + 0x05, 0x06, 0x07, 0x08, 0x45, 0x46, 0x47, 0x48, + 0x06, 0x07, 0x08, 0x09, 0x46, 0x47, 0x48, 0x49, + 0x07, 0x08, 0x09, 0x0a, 0x47, 0x48, 0x49, 0x4a + /* clang-format on */ +}; +static_always_inline u64x8 +clib_toeplitz_hash_gfni_one (u8x64 d0, u64x8 m, int i) +{ + + d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0; + d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0; + d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0; + d0 = i == 4 ? 
(u8x64) u64x8_align_right (d0, d0, 4) : d0; + d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0; + d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0; + + d0 = u8x64_permute (__clib_toeplitz_hash_gfni_permute, d0); + + return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0); +} + +static_always_inline u64x8 +clib_toeplitz_hash_gfni_two (u8x64 d0, u8x64 d1, u64x8 m, int i) +{ + + d0 = i == 1 ? (u8x64) u64x8_align_right (d0, d0, 1) : d0; + d1 = i == 1 ? (u8x64) u64x8_align_right (d1, d1, 1) : d1; + d0 = i == 2 ? (u8x64) u64x8_align_right (d0, d0, 2) : d0; + d1 = i == 2 ? (u8x64) u64x8_align_right (d1, d1, 2) : d1; + d0 = i == 3 ? (u8x64) u64x8_align_right (d0, d0, 3) : d0; + d1 = i == 3 ? (u8x64) u64x8_align_right (d1, d1, 3) : d1; + d0 = i == 4 ? (u8x64) u64x8_align_right (d0, d0, 4) : d0; + d1 = i == 4 ? (u8x64) u64x8_align_right (d1, d1, 4) : d1; + d0 = i == 5 ? (u8x64) u64x8_align_right (d0, d0, 5) : d0; + d1 = i == 5 ? (u8x64) u64x8_align_right (d1, d1, 5) : d1; + d0 = i == 6 ? (u8x64) u64x8_align_right (d0, d0, 6) : d0; + d1 = i == 6 ? (u8x64) u64x8_align_right (d1, d1, 6) : d1; + + d0 = u8x64_permute2 (__clib_toeplitz_hash_gfni_permute, d0, d1); + + return (u64x8) _mm512_gf2p8affine_epi64_epi8 ((__m512i) d0, (__m512i) m, 0); +} +#endif + +static_always_inline u32 +clib_toeplitz_hash (clib_toeplitz_hash_key_t *k, u8 *data, int n_bytes) +{ + u8 *key = k->data; + /* key must be 4 bytes longer than data */ + ASSERT (k->key_length - n_bytes >= 4); + +#if defined(__GFNI__) && defined(__AVX512F__) + u8x64 d0; + u64x8 h0 = {}; + u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset); + + /* move data ptr backwards for 3 byte so mask load "prepends" three zeros */ + data -= 3; + n_bytes += 3; + + if (n_bytes < 64) + { + d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes - 3) << 3); + goto last8; + } + + d0 = u8x64_mask_load_zero ((u8 *) data, -1ULL << 3); +next56: + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[0], 0), + clib_toeplitz_hash_gfni_one (d0, m[1], 1)); + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[2], 2), + clib_toeplitz_hash_gfni_one (d0, m[3], 3)); + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_one (d0, m[4], 4), + clib_toeplitz_hash_gfni_one (d0, m[5], 5)); + h0 ^= clib_toeplitz_hash_gfni_one (d0, m[6], 6); + n_bytes -= 56; + data += 56; + m += 7; + + if (n_bytes >= 64) + { + d0 = *(u8x64u *) data; + goto next56; + } + + if (n_bytes == 0) + goto done; + + d0 = u8x64_mask_load_zero ((u8 *) data, pow2_mask (n_bytes)); +last8: + h0 ^= clib_toeplitz_hash_gfni_one (d0, m[0], 0); + n_bytes -= 8; + + if (n_bytes > 0) + { + m += 1; + d0 = (u8x64) u64x8_align_right (u64x8_zero (), d0, 1); + goto last8; + } + +done: + return u64x8_hxor (h0); +#elif defined(CLIB_HAVE_VEC256) + u64x4 v4, shift = { 0, 1, 2, 3 }; + u32x8 h0 = {}; + + while (n_bytes >= 4) + { + v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift; + + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + h0 = toeplitz_hash_one_x8 (h0, v4, data[2], 2); + h0 = toeplitz_hash_one_x8 (h0, v4, data[3], 3); + + data += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + + if (n_bytes == 3) + { + v |= (u64) key[5] << 16; + v |= (u64) key[6] << 8; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + h0 = toeplitz_hash_one_x8 (h0, v4, 
data[2], 2); + } + else if (n_bytes == 2) + { + v |= (u64) key[5] << 16; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + h0 = toeplitz_hash_one_x8 (h0, v4, data[1], 1); + } + else + { + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data[0], 0); + } + } + + return u32x8_hxor (h0); +#endif + u64 v, hash = 0; + + while (n_bytes >= 4) + { + v = clib_net_to_host_u64 (*(u64u *) key); + + for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + for (u8 bit = 1 << 7, byte = data[3]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + + data += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + for (u8 bit = 1 << 7, byte = data[0]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + if (n_bytes > 1) + { + v |= (u64) key[5] << 24; + for (u8 bit = 1 << 7, byte = data[1]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + } + if (n_bytes > 2) + { + v |= (u64) key[6] << 24; + for (u8 bit = 1 << 7, byte = data[2]; bit; bit >>= 1, v <<= 1) + hash ^= byte & bit ? v : 0; + } + } + return hash >> 32; +} + +static_always_inline void +clib_toeplitz_hash_x4 (clib_toeplitz_hash_key_t *k, u8 *data0, u8 *data1, + u8 *data2, u8 *data3, u32 *hash0, u32 *hash1, + u32 *hash2, u32 *hash3, int n_bytes) +{ + /* key must be 4 bytes longer than data */ + ASSERT (k->key_length - n_bytes >= 4); +#if defined(__GFNI__) && defined(__AVX512F__) + u64x8u *m = (u64x8u *) ((u8 *) k + k->gfni_offset); + u8x64 d0, d1, d2, d3; + u64x8 h0 = {}, h2 = {}; + u64 h, mask; + + /* move data ptr backwards for 3 byte so mask load "prepends" three zeros */ + data0 -= 3; + data1 -= 3; + data2 -= 3; + data3 -= 3; + n_bytes += 3; + + if (n_bytes < 64) + { + mask = pow2_mask (n_bytes - 3) << 3; + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); + goto last8; + } + + mask = -1ULL << 3; + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); +next56: + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0), + clib_toeplitz_hash_gfni_two (d0, d1, m[1], 1)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0), + clib_toeplitz_hash_gfni_two (d2, d3, m[1], 1)); + + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[2], 2), + clib_toeplitz_hash_gfni_two (d0, d1, m[3], 3)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[2], 2), + clib_toeplitz_hash_gfni_two (d2, d3, m[3], 3)); + + h0 = u64x8_xor3 (h0, clib_toeplitz_hash_gfni_two (d0, d1, m[4], 4), + clib_toeplitz_hash_gfni_two (d0, d1, m[5], 5)); + h2 = u64x8_xor3 (h2, clib_toeplitz_hash_gfni_two (d2, d3, m[4], 4), + clib_toeplitz_hash_gfni_two (d2, d3, m[5], 5)); + + h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[6], 6); + h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[6], 6); + + n_bytes -= 56; + data0 += 56; + data1 += 56; + data2 += 56; + data3 += 56; + m += 7; + + if (n_bytes >= 64) + { + d0 = *(u8x64u *) data0; + d1 = *(u8x64u *) data1; + d2 = *(u8x64u *) data2; + d3 = 
*(u8x64u *) data3; + goto next56; + } + + if (n_bytes == 0) + goto done; + + mask = pow2_mask (n_bytes); + d0 = u8x64_mask_load_zero ((u8 *) data0, mask); + d1 = u8x64_mask_load_zero ((u8 *) data1, mask); + d2 = u8x64_mask_load_zero ((u8 *) data2, mask); + d3 = u8x64_mask_load_zero ((u8 *) data3, mask); +last8: + h0 ^= clib_toeplitz_hash_gfni_two (d0, d1, m[0], 0); + h2 ^= clib_toeplitz_hash_gfni_two (d2, d3, m[0], 0); + n_bytes -= 8; + + if (n_bytes > 0) + { + u64x8 zero = {}; + m += 1; + d0 = (u8x64) u64x8_align_right (zero, d0, 1); + d1 = (u8x64) u64x8_align_right (zero, d1, 1); + d2 = (u8x64) u64x8_align_right (zero, d2, 1); + d3 = (u8x64) u64x8_align_right (zero, d3, 1); + goto last8; + } + +done: + h = u64x8_hxor (h0); + *hash0 = h; + *hash1 = h >> 32; + h = u64x8_hxor (h2); + *hash2 = h; + *hash3 = h >> 32; +#elif defined(CLIB_HAVE_VEC256) + u8 *key = k->data; + u64x4 v4, shift = { 0, 1, 2, 3 }; + u32x8 h0 = {}, h1 = {}, h2 = {}, h3 = {}; + + while (n_bytes >= 4) + { + v4 = u64x4_splat (clib_net_to_host_u64 (*(u64u *) key)) << shift; + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[3], 3); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[3], 3); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[3], 3); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[3], 3); + + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + u64 v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + + if (n_bytes == 3) + { + v |= (u64) key[5] << 16; + v |= (u64) key[6] << 8; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[2], 2); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[2], 2); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[2], 2); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[2], 2); + } + else if (n_bytes == 2) + { + v |= (u64) key[5] << 16; + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[0], 0); + + h0 = toeplitz_hash_one_x8 (h0, v4, data0[1], 1); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[1], 1); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[1], 1); + h3 = toeplitz_hash_one_x8 (h3, v4, data3[1], 1); + } + else + { + v4 = u64x4_splat (v) << shift; + h0 = toeplitz_hash_one_x8 (h0, v4, data0[0], 0); + h1 = toeplitz_hash_one_x8 (h1, v4, data1[0], 0); + h2 = toeplitz_hash_one_x8 (h2, v4, data2[0], 0); + h3 = toeplitz_hash_one_x8 (h3, 
v4, data3[0], 0); + } + } + + *hash0 = u32x8_hxor (h0); + *hash1 = u32x8_hxor (h1); + *hash2 = u32x8_hxor (h2); + *hash3 = u32x8_hxor (h3); +#else + u8 *key = k->data; + u64 v, h0 = 0, h1 = 0, h2 = 0, h3 = 0; + + while (n_bytes >= 4) + { + v = clib_net_to_host_u64 (*(u64u *) key); + + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[0] & bit ? v : 0; + h1 ^= data1[0] & bit ? v : 0; + h2 ^= data2[0] & bit ? v : 0; + h3 ^= data3[0] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[1] & bit ? v : 0; + h1 ^= data1[1] & bit ? v : 0; + h2 ^= data2[1] & bit ? v : 0; + h3 ^= data3[1] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[2] & bit ? v : 0; + h1 ^= data1[2] & bit ? v : 0; + h2 ^= data2[2] & bit ? v : 0; + h3 ^= data3[2] & bit ? v : 0; + } + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[3] & bit ? v : 0; + h1 ^= data1[3] & bit ? v : 0; + h2 ^= data2[3] & bit ? v : 0; + h3 ^= data3[3] & bit ? v : 0; + } + + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + key += 4; + n_bytes -= 4; + } + + if (n_bytes) + { + v = (u64) clib_net_to_host_u32 ((u64) (*(u32u *) key)) << 32; + v |= (u64) key[4] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[0] & bit ? v : 0; + h1 ^= data1[0] & bit ? v : 0; + h2 ^= data2[0] & bit ? v : 0; + h3 ^= data3[0] & bit ? v : 0; + } + if (n_bytes > 1) + { + v |= (u64) key[5] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[1] & bit ? v : 0; + h1 ^= data1[1] & bit ? v : 0; + h2 ^= data2[1] & bit ? v : 0; + h3 ^= data3[1] & bit ? v : 0; + } + } + if (n_bytes > 2) + { + v |= (u64) key[6] << 24; + for (u8 bit = 1 << 7; bit; bit >>= 1, v <<= 1) + { + h0 ^= data0[2] & bit ? v : 0; + h1 ^= data1[2] & bit ? v : 0; + h2 ^= data2[2] & bit ? v : 0; + h3 ^= data3[2] & bit ? v : 0; + } + } + } + *hash0 = h0 >> 32; + *hash1 = h1 >> 32; + *hash2 = h2 >> 32; + *hash3 = h3 >> 32; +#endif +} + +#endif |
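
Usage note (not part of the diff above): a minimal sketch of how the new Toeplitz API added in src/vppinfra/vector/toeplitz.[ch] fits together, assuming a program built against vppinfra. Only functions that appear in the patch are used (clib_toeplitz_hash_key_init, clib_toeplitz_hash, clib_toeplitz_hash_x4, clib_toeplitz_hash_key_free); the 12-byte tuple contents and the printed output format are illustrative and are not test vectors taken from the patch.

#include <vppinfra/format.h>
#include <vppinfra/mem.h>
#include <vppinfra/vector/toeplitz.h>

int
main (int argc, char *argv[])
{
  clib_toeplitz_hash_key_t *k;
  /* 12-byte RSS-style input: src IP, dst IP, src port, dst port, all in
   * network byte order; the addresses and ports below are made-up examples */
  u8 tuple[12] = { 192, 0, 2, 1, 198, 51, 100, 2, 0x1f, 0x90, 0x00, 0x50 };
  u32 h, h0, h1, h2, h3;

  /* heap must exist before clib_toeplitz_hash_key_init allocates the key */
  clib_mem_init (0, 64ULL << 20);

  /* key == 0 selects the built-in 40-byte default (the well-known RSS key);
   * the key must be at least 4 bytes longer than the data being hashed */
  k = clib_toeplitz_hash_key_init (0, 0);

  /* single-buffer variant */
  h = clib_toeplitz_hash (k, tuple, sizeof (tuple));

  /* 4-buffer variant, here fed the same tuple four times for brevity */
  clib_toeplitz_hash_x4 (k, tuple, tuple, tuple, tuple, &h0, &h1, &h2, &h3,
			 sizeof (tuple));

  fformat (stdout, "hash %x, x4 %x %x %x %x\n", h, h0, h1, h2, h3);

  clib_toeplitz_hash_key_free (k);
  return 0;
}

Depending on the target CPU, the same call resolves to the GFNI/AVX-512, AVX2, or scalar path inside clib_toeplitz_hash; the result is intended to be identical across them, which is what makes a comparison like the one above a reasonable smoke test.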