From aa63bc6cf4b9031c3fc6ae22aecd846cc712bc52 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 8 Nov 2021 11:18:30 +0000 Subject: vppinfra: new vectorized ip checksum functions incl. csum_and_copy Type: improvement Change-Id: Id5810b7f4a6d6e4ce16b73c235b50db5d475ebf7 Signed-off-by: Damjan Marion --- src/vppinfra/CMakeLists.txt | 2 + src/vppinfra/vector/ip_csum.h | 339 +++++++++++++++++++++++++++++++++++++ src/vppinfra/vector/test/ip_csum.c | 120 +++++++++++++ src/vppinfra/vector_neon.h | 12 ++ 4 files changed, 473 insertions(+) create mode 100644 src/vppinfra/vector/ip_csum.h create mode 100644 src/vppinfra/vector/test/ip_csum.c (limited to 'src/vppinfra') diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt index 9f407a10a22..7a73fe531d7 100644 --- a/src/vppinfra/CMakeLists.txt +++ b/src/vppinfra/CMakeLists.txt @@ -194,6 +194,7 @@ set(VPPINFRA_HEADERS vector/compress.h vector/count_equal.h vector/index_to_ptr.h + vector/ip_csum.h vector/mask_compare.h vector.h vector_neon.h @@ -275,6 +276,7 @@ set(test_files vector/test/compress.c vector/test/count_equal.c vector/test/index_to_ptr.c + vector/test/ip_csum.c vector/test/mask_compare.c vector/test/memcpy_x86_64.c ) diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h new file mode 100644 index 00000000000..2cea9b448ea --- /dev/null +++ b/src/vppinfra/vector/ip_csum.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2021 Cisco Systems, Inc. 
+ */
+
+#ifndef included_vector_ip_csum_h
+#define included_vector_ip_csum_h
+#include <vppinfra/clib.h>
+
+/* Running state of a checksum computed over one or more chunks.
+ *  sum - 64-bit accumulator of the data added so far, not yet folded
+ *  odd - non-zero when the data added so far ended in the middle of a
+ *        16-bit word; the first byte of the next chunk then completes it */
+typedef struct
+{
+  u64 sum;
+  u8 odd;
+} clib_ip_csum_t;
+
+#if defined(CLIB_HAVE_VEC128)
+/* Interleave the four 32-bit lanes of v with zeros, reinterpret both halves
+ * as 64-bit lanes and add them, giving two 64-bit partial sums. */
+static_always_inline u64x2
+clib_ip_csum_cvt_and_add_4 (u32x4 v)
+{
+  return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
+          (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
+}
+
+/* Horizontal add of the two 64-bit lanes. */
+static_always_inline u64
+clib_ip_csum_hadd_2 (u64x2 v)
+{
+  return v[0] + v[1];
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+/* 256-bit variant of clib_ip_csum_cvt_and_add_4: eight 32-bit lanes in,
+ * four 64-bit partial sums out. */
+static_always_inline u64x4
+clib_ip_csum_cvt_and_add_8 (u32x8 v)
+{
+  return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
+          (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
+}
+
+/* Horizontal add of the four 64-bit lanes. */
+static_always_inline u64
+clib_ip_csum_hadd_4 (u64x4 v)
+{
+  return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC512)
+/* 512-bit variant of clib_ip_csum_cvt_and_add_4: sixteen 32-bit lanes in,
+ * eight 64-bit partial sums out. */
+static_always_inline u64x8
+clib_ip_csum_cvt_and_add_16 (u32x16 v)
+{
+  return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
+          (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
+}
+
+/* Horizontal add of the eight 64-bit lanes. */
+static_always_inline u64
+clib_ip_csum_hadd_8 (u64x8 v)
+{
+  return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
+}
+#endif
+
+/* Add count bytes starting at src to the running checksum state c and, when
+ * is_copy is non-zero, also copy those bytes to dst (dst is not touched when
+ * is_copy is zero).
+ *
+ * Data is accumulated into c->sum as native 16/32-bit words without folding;
+ * call clib_ip_csum_fold () to obtain the final 16-bit value. If the chunk
+ * ends in the middle of a 16-bit word, c->odd is set so that the next call
+ * pairs its first byte with the dangling one. */
+static_always_inline void
+clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
+                     int is_copy)
+{
+  /* count is tested as well: an empty chunk following an odd-length one must
+   * neither read src[0] nor wrap count around to 65535 */
+  if (c->odd && count)
+    {
+      /* src[0] is the upper byte of the 16-bit word whose lower byte was
+       * added at the end of the previous chunk */
+      c->odd = 0;
+      c->sum += (u16) src[0] << 8;
+      /* copy before advancing src, otherwise the following byte would be
+       * stored in place of this one */
+      if (is_copy)
+        dst++[0] = src[0];
+      count--;
+      src++;
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  u64x8 sum8 = {};
+
+  /* unrolled: 8 vectors of 64 bytes per iteration */
+  while (count >= 512)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
+      count -= 512;
+      src += 512;
+      if (is_copy)
+        {
+          u32x16u *d = (u32x16u *) dst;
+          d[0] = s[0];
+          d[1] = s[1];
+          d[2] = s[2];
+          d[3] = s[3];
+          d[4] = s[4];
+          d[5] = s[5];
+          d[6] = s[6];
+          d[7] = s[7];
+          dst += 512;
+        }
+    }
+
+  /* one 64-byte vector per iteration */
+  while (count >= 64)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      count -= 64;
+      src += 64;
+      if (is_copy)
+        {
+          u32x16u *d = (u32x16u *) dst;
+          d[0] = s[0];
+          dst += 64;
+        }
+    }
+
+#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE
+  /* remaining 1..63 bytes are handled with a single masked load; bytes
+   * outside the mask are loaded as zero and do not change the sum */
+  if (count)
+    {
+      u64 mask = pow2_mask (count);
+      u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
+      sum8 += clib_ip_csum_cvt_and_add_16 (v);
+      c->odd = count & 1;
+      /* mask holds one bit per byte, so the store has to be byte-granular
+       * as well; a 32-bit lane store would write past dst + count.
+       * NOTE(review): assumes u8x64_mask_store is provided together with
+       * u8x64_mask_load_zero - confirm */
+      if (is_copy)
+        u8x64_mask_store ((u8x64) v, dst, mask);
+    }
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+#elif defined(CLIB_HAVE_VEC256)
+  u64x4 sum4 = {};
+
+  /* unrolled: 8 vectors of 32 bytes per iteration */
+  while (count >= 256)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
+      count -= 256;
+      src += 256;
+      if (is_copy)
+        {
+          u32x8u *d = (u32x8u *) dst;
+          d[0] = s[0];
+          d[1] = s[1];
+          d[2] = s[2];
+          d[3] = s[3];
+          d[4] = s[4];
+          d[5] = s[5];
+          d[6] = s[6];
+          d[7] = s[7];
+          dst += 256;
+        }
+    }
+
+  /* one 32-byte vector per iteration */
+  while (count >= 32)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      count -= 32;
+      src += 32;
+      if (is_copy)
+        {
+          u32x8u *d = (u32x8u *) dst;
+          d[0] = s[0];
+          dst += 32;
+        }
+    }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  /* remaining 1..31 bytes are handled with a single masked load; bytes
+   * outside the mask are loaded as zero and do not change the sum */
+  if (count)
+    {
+      u32 mask = pow2_mask (count);
+      u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
+      sum4 += clib_ip_csum_cvt_and_add_8 (v);
+      c->odd = count & 1;
+      /* per-byte mask, hence a byte-granular store (see the 512-bit path).
+       * NOTE(review): assumes u8x32_mask_store is provided together with
+       * u8x32_mask_load_zero - confirm */
+      if (is_copy)
+        u8x32_mask_store ((u8x32) v, dst, mask);
+    }
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+#elif defined(CLIB_HAVE_VEC128)
+  u64x2 sum2 = {};
+
+  /* unrolled: 8 vectors of 16 bytes per iteration */
+  while (count >= 128)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
+      count -= 128;
+      src += 128;
+      if (is_copy)
+        {
+          u32x4u *d = (u32x4u *) dst;
+          d[0] = s[0];
+          d[1] = s[1];
+          d[2] = s[2];
+          d[3] = s[3];
+          d[4] = s[4];
+          d[5] = s[5];
+          d[6] = s[6];
+          d[7] = s[7];
+          dst += 128;
+        }
+    }
+
+  /* one 16-byte vector per iteration */
+  while (count >= 16)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      count -= 16;
+      src += 16;
+      if (is_copy)
+        {
+          u32x4u *d = (u32x4u *) dst;
+          d[0] = s[0];
+          dst += 16;
+        }
+    }
+  c->sum += clib_ip_csum_hadd_2 (sum2);
+#else
+  /* no vector unit: 4 bytes per iteration */
+  while (count >= 4)
+    {
+      u32 v = *((u32 *) src);
+      c->sum += v;
+      count -= 4;
+      src += 4;
+      if (is_copy)
+        {
+          *(u32 *) dst = v;
+          dst += 4;
+        }
+    }
+#endif
+  /* whatever the wide loops left over, 2 bytes at a time */
+  while (count >= 2)
+    {
+      u16 v = *((u16 *) src);
+      c->sum += v;
+      count -= 2;
+      src += 2;
+      if (is_copy)
+        {
+          *(u16 *) dst = v;
+          dst += 2;
+        }
+    }
+
+  /* dangling last byte: add it as the lower byte of a 16-bit word and
+   * remember that the next chunk has to supply the upper byte */
+  if (count)
+    {
+      c->odd = 1;
+      c->sum += (u16) src[0];
+      if (is_copy)
+        dst[0] = src[0];
+    }
+}
+
+/* Fold the 64-bit accumulator of c down to 16 bits, adding the carries back
+ * in, and return the one's complement of the result. c is not modified. */
+static_always_inline u16
+clib_ip_csum_fold (clib_ip_csum_t *c)
+{
+  u64 sum = c->sum;
+#if defined(__x86_64__) && defined(__BMI2__)
+  u64 tmp = sum;
+  asm volatile(
+    /* using ADC is much faster than mov, shift, add sequence
+     * compiler produces */
+    "shr $32, %[sum] \n\t"
+    "add %k[tmp], %k[sum] \n\t"
+    "mov $16, %k[tmp] \n\t"
+    "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+    "adc %w[tmp], %w[sum] \n\t"
+    "adc $0, %w[sum] \n\t"
+    : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
+#else
+  sum = ((u32) sum) + (sum >> 32);
+  sum = ((u16) sum) + (sum >> 16);
+  sum = ((u16) sum) + (sum >> 16);
+#endif
+  return (~((u16) sum));
+}
+
+/* Add count bytes at src to the running checksum c (no copy). */
+static_always_inline void
+clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
+{
+  clib_ip_csum_inline (c, 0, src, count, 0);
+}
+
+/* Add count bytes at src to the running checksum c and copy them to dst.
+ * Note the argument order: src comes before dst here, unlike in
+ * clib_ip_csum_and_copy (). */
+static_always_inline void
+clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
+{
+  clib_ip_csum_inline (c, dst, src, count, 1);
+}
+
+/* Compute the folded, complemented 16-bit checksum of count bytes at src.
+ * When count is a compile-time constant equal to 12, 20 or 40 the data is
+ * summed directly as 32-bit words; any other length goes through
+ * clib_ip_csum_inline (). */
+static_always_inline u16
+clib_ip_csum (u8 *src, u16 count)
+{
+  clib_ip_csum_t c = {};
+  if (COMPILE_TIME_CONST (count) && count == 12)
+    {
+      for (int i = 0; i < 3; i++)
+        c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 20)
+    {
+      for (int i = 0; i < 5; i++)
+        c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 40)
+    {
+      for (int i = 0; i < 10; i++)
+        c.sum += ((u32 *) src)[i];
+    }
+  else
+    clib_ip_csum_inline (&c, 0, src, count, 0);
+  return clib_ip_csum_fold (&c);
+}
+
+/* Copy count bytes from src to dst and return the folded, complemented
+ * 16-bit checksum of the copied data. */
+static_always_inline u16
+clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
+{
+  clib_ip_csum_t c = {};
+  clib_ip_csum_inline (&c, dst, src, count, 1);
+  return clib_ip_csum_fold (&c);
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c
new file mode 100644
index 00000000000..135d5ae63b2
--- /dev/null
+++ b/src/vppinfra/vector/test/ip_csum.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/ip_csum.h>
+
+/* One test case: up to five chunks fed in order into a single running
+ * checksum, and the value expected from clib_ip_csum_fold () afterwards.
+ * A chunk with count == 0 is skipped; a chunk with src == 0 reads from the
+ * pattern buffer generated by the test function. */
+typedef struct
+{
+  struct
+  {
+    u8 *src;
+    u32 count;
+  } chunk[5];
+  u16 result;
+} ip_csum_test_t;
+
+/* 21 bytes of fixed input; only the first 20 are checksummed, see TEST_LEN */
+static u8 test1[] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40,
+                      0x00, 0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8,
+                      0x00, 0x01, 0xc0, 0xa8, 0x00, 0xc7, 0x00 };
+/* number of array elements excluding the last one */
+#define TEST_LEN(x) (ARRAY_LEN (x) - 1)
+
+static ip_csum_test_t tests[] = {
+  /* whole of test1 in one chunk */
+  {
+    .chunk[0].src = test1,
+    .chunk[0].count = TEST_LEN (test1),
+    .result = 0x61b8,
+  },
+  /* same data split into chunks of 1, 2, 3, 4 and 10 bytes, so that chunks
+   * start on odd offsets; the result must match the single-chunk case */
+  {
+    .chunk[0].src = test1,
+    .chunk[0].count = 1,
+    .chunk[1].src = test1 + 1,
+    .chunk[1].count = 2,
+    .chunk[2].src = test1 + 3,
+    .chunk[2].count = 3,
+    .chunk[3].src = test1 + 6,
+    .chunk[3].count = 4,
+    .chunk[4].src = test1 + 10,
+    .chunk[4].count = TEST_LEN (test1) - 10,
+    .result = 0x61b8,
+  },
+  /* the remaining cases have no .src and run over the pattern buffer */
+  {
+    .chunk[0].count = 1,
+    .result = 0xff0f,
+  },
+  {
+    .chunk[0].count = 2,
+    .result = 0x080f,
+  },
+  {
+    .chunk[0].count = 3,
+    .result = 0x0711,
+  },
+  {
+    .chunk[0].count = 4,
+    .result = 0x1210,
+  },
+  {
+    .chunk[0].count = 63,
+    .result = 0xda01,
+  },
+  {
+    .chunk[0].count = 64,
+    .result = 0xe100,
+  },
+  {
+    .chunk[0].count = 65,
+    .result = 0xe010,
+  },
+  {
+    .chunk[0].count = 65535,
+    .result = 0xfc84,
+  },
+  /* count is u32 here but clib_ip_csum_chunk () takes a u16, so 65536 is
+   * truncated to 0 and this case yields the checksum of no data at all */
+  {
+    .chunk[0].count = 65536,
+    .result = 0xffff,
+  }
+};
+
+/* Run every entry of tests[]: feed its chunks through clib_ip_csum_chunk (),
+ * fold, and compare with the expected result. Stops at the first mismatch
+ * and returns an error naming the failing case; returns err unchanged when
+ * all cases pass. */
+static clib_error_t *
+test_clib_ip_csum (clib_error_t *err)
+{
+  u8 *buf;
+  buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES);
+  /* deterministic pattern used by the cases without an explicit source */
+  for (int i = 0; i < 65536; i++)
+    buf[i] = 0xf0 + ((i * 7) & 0xf);
+
+  for (int i = 0; i < ARRAY_LEN (tests); i++)
+    {
+      clib_ip_csum_t c = {};
+      ip_csum_test_t *t = tests + i;
+      u16 rv;
+
+      for (int j = 0; j < ARRAY_LEN (((ip_csum_test_t *) 0)->chunk); j++)
+        if (t->chunk[j].count > 0)
+          {
+            if (t->chunk[j].src == 0)
+              clib_ip_csum_chunk (&c, buf, t->chunk[j].count);
+            else
+              clib_ip_csum_chunk (&c, t->chunk[j].src, t->chunk[j].count);
+          }
+      rv = clib_ip_csum_fold (&c);
+
+      if (rv != tests[i].result)
+        {
+          err = clib_error_return (err,
+                                   "bad checksum in test case %u (expected "
+                                   "0x%04x, calculated 0x%04x)",
+                                   i, tests[i].result, rv);
+          goto done;
+        }
+    }
+done:
+  clib_mem_free (buf);
+  return err;
+}
+
+REGISTER_TEST (clib_ip_csum) = {
+  .name = "clib_ip_csum",
+  .fn = test_clib_ip_csum,
+};
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index 70b05c60884..80d7bda9f3c 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -211,6 +211,18 @@ u32x4_min_scalar (u32x4 v)
 #define u8x16_word_shift_left(x,n) vextq_u8(u8x16_splat (0), x, 16 - n)
 #define u8x16_word_shift_right(x,n) vextq_u8(x, u8x16_splat (0), n)
 
+always_inline u32x4
+u32x4_interleave_hi (u32x4 a, u32x4 b)
+{
+  return (u32x4) vzip2q_u32 (a, b);
+}
+
+always_inline u32x4
+u32x4_interleave_lo (u32x4 a, u32x4 b)
+{
+  return (u32x4) vzip1q_u32 (a, b);
+}
+
 static_always_inline u8x16
 u8x16_reflect (u8x16 v)
 {
-- cgit 1.2.3-korg