From aa63bc6cf4b9031c3fc6ae22aecd846cc712bc52 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 8 Nov 2021 11:18:30 +0000 Subject: vppinfra: new vectorized ip checksum functions incl. csum_and_copy Type: improvement Change-Id: Id5810b7f4a6d6e4ce16b73c235b50db5d475ebf7 Signed-off-by: Damjan Marion --- src/vppinfra/vector/ip_csum.h | 339 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 src/vppinfra/vector/ip_csum.h (limited to 'src/vppinfra/vector/ip_csum.h') diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h new file mode 100644 index 00000000000..2cea9b448ea --- /dev/null +++ b/src/vppinfra/vector/ip_csum.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. + */ + +#ifndef included_vector_ip_csum_h +#define included_vector_ip_csum_h +#include +typedef struct +{ + u64 sum; + u8 odd; +} clib_ip_csum_t; + +#if defined(CLIB_HAVE_VEC128) +static_always_inline u64x2 +clib_ip_csum_cvt_and_add_4 (u32x4 v) +{ + return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) + + (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_2 (u64x2 v) +{ + return v[0] + v[1]; +} +#endif + +#if defined(CLIB_HAVE_VEC256) +static_always_inline u64x4 +clib_ip_csum_cvt_and_add_8 (u32x8 v) +{ + return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) + + (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_4 (u64x4 v) +{ + return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v)); +} +#endif + +#if defined(CLIB_HAVE_VEC512) +static_always_inline u64x8 +clib_ip_csum_cvt_and_add_16 (u32x16 v) +{ + return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) + + (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ())); +} +static_always_inline u64 +clib_ip_csum_hadd_8 (u64x8 v) +{ + return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v)); +} +#endif + +static_always_inline void +clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count, + int is_copy) +{ + if (c->odd) + { + c->odd = 0; + c->sum += (u16) src[0] << 8; + count--; + src++; + if (is_copy) + dst++[0] = src[0]; + } + +#if defined(CLIB_HAVE_VEC512) + u64x8 sum8 = {}; + + while (count >= 512) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[1]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[2]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[3]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[8]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[5]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[6]); + sum8 += clib_ip_csum_cvt_and_add_16 (s[7]); + count -= 512; + src += 512; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 512; + } + } + + while (count >= 64) + { + u32x16u *s = (u32x16u *) src; + sum8 += clib_ip_csum_cvt_and_add_16 (s[0]); + count -= 64; + src += 64; + if (is_copy) + { + u32x16u *d = (u32x16u *) dst; + d[0] = s[0]; + dst += 512; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u64 mask = pow2_mask (count); + u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask); + sum8 += clib_ip_csum_cvt_and_add_16 (v); + c->odd = count & 1; + if (is_copy) + u32x16_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_8 (sum8); + return; +#endif + + c->sum += clib_ip_csum_hadd_8 (sum8); +#elif defined(CLIB_HAVE_VEC256) + u64x4 sum4 = {}; + + while (count >= 256) + { + u32x8u *s = (u32x8u *) src; + sum4 += clib_ip_csum_cvt_and_add_8 (s[0]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[1]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[2]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[3]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[4]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[5]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[6]); + sum4 += clib_ip_csum_cvt_and_add_8 (s[7]); + count -= 256; + src += 256; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 256; + } + } + + while (count >= 32) + { + u32x8u *s = (u32x8u *) src; + sum4 += clib_ip_csum_cvt_and_add_8 (s[0]); + count -= 32; + src += 32; + if (is_copy) + { + u32x8u *d = (u32x8u *) dst; + d[0] = s[0]; + dst += 32; + } + } + +#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE + if (count) + { + u32 mask = pow2_mask (count); + u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask); + sum4 += clib_ip_csum_cvt_and_add_8 (v); + c->odd = count & 1; + if (is_copy) + u32x8_mask_store (v, dst, mask); + } + c->sum += clib_ip_csum_hadd_4 (sum4); + return; +#endif + + c->sum += clib_ip_csum_hadd_4 (sum4); +#elif defined(CLIB_HAVE_VEC128) + u64x2 sum2 = {}; + + while (count >= 128) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[1]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[2]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[3]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[4]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[5]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[6]); + sum2 += clib_ip_csum_cvt_and_add_4 (s[7]); + count -= 128; + src += 128; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + dst += 128; + } + } + + while (count >= 16) + { + u32x4u *s = (u32x4u *) src; + sum2 += clib_ip_csum_cvt_and_add_4 (s[0]); + count -= 16; + src += 16; + if (is_copy) + { + u32x4u *d = (u32x4u *) dst; + d[0] = s[0]; + dst += 16; + } + } + c->sum += clib_ip_csum_hadd_2 (sum2); +#else + while (count >= 4) + { + u32 v = *((u32 *) src); + c->sum += v; + count -= 4; + src += 4; + if (is_copy) + { + *(u32 *) dst = v; + dst += 4; + } + } +#endif + while (count >= 2) + { + u16 v = *((u16 *) src); + c->sum += v; + count -= 2; + src += 2; + if (is_copy) + { + *(u16 *) dst = v; + dst += 2; + } + } + + if (count) + { + c->odd = 1; + c->sum += (u16) src[0]; + if (is_copy) + dst[0] = src[0]; + } +} + +static_always_inline u16 +clib_ip_csum_fold (clib_ip_csum_t *c) +{ + u64 sum = c->sum; +#if defined(__x86_64__) && defined(__BMI2__) + u64 tmp = sum; + asm volatile( + /* using ADC is much faster than mov, shift, add sequence + * compiler produces */ + "shr $32, %[sum] \n\t" + "add %k[tmp], %k[sum] \n\t" + "mov $16, %k[tmp] \n\t" + "shrx %k[tmp], %k[sum], %k[tmp] \n\t" + "adc %w[tmp], %w[sum] \n\t" + "adc $0, %w[sum] \n\t" + : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp)); +#else + sum = ((u32) sum) + (sum >> 32); + sum = ((u16) sum) + (sum >> 16); + sum = ((u16) sum) + (sum >> 16); +#endif + return (~((u16) sum)); +} + +static_always_inline void +clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count) +{ + return clib_ip_csum_inline (c, 0, src, count, 0); +} + +static_always_inline void +clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count) +{ + return clib_ip_csum_inline (c, dst, src, count, 1); +} + +static_always_inline u16 +clib_ip_csum (u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + if (COMPILE_TIME_CONST (count) && count == 12) + { + for (int i = 0; i < 3; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 20) + { + for (int i = 0; i < 5; i++) + c.sum += ((u32 *) src)[i]; + } + else if (COMPILE_TIME_CONST (count) && count == 40) + { + for (int i = 0; i < 10; i++) + c.sum += ((u32 *) src)[i]; + } + else + clib_ip_csum_inline (&c, 0, src, count, 0); + return clib_ip_csum_fold (&c); +} + +static_always_inline u16 +clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count) +{ + clib_ip_csum_t c = {}; + clib_ip_csum_inline (&c, dst, src, count, 1); + return clib_ip_csum_fold (&c); +} + +#endif -- cgit 1.2.3-korg