author     Damjan Marion <damarion@cisco.com>       2021-11-08 11:18:30 +0000
committer  Florin Coras <florin.coras@gmail.com>    2021-11-10 23:22:58 +0000
commit     aa63bc6cf4b9031c3fc6ae22aecd846cc712bc52 (patch)
tree       c007868ca129f5594ac9c5aa460edea4aa63951e /src
parent     29355644c5eca85b83b183ff887633dbcf86cf35 (diff)
vppinfra: new vectorized ip checksum functions incl. csum_and_copy
Type: improvement
Change-Id: Id5810b7f4a6d6e4ce16b73c235b50db5d475ebf7
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src')
-rw-r--r--  src/plugins/avf/output.c               5
-rw-r--r--  src/vnet/ip/ip4_input.h               10
-rwxr-xr-x  src/vnet/ip/ip_packet.h               92
-rw-r--r--  src/vnet/ip/ip_psh_cksum.h             7
-rw-r--r--  src/vppinfra/CMakeLists.txt            2
-rw-r--r--  src/vppinfra/vector/ip_csum.h        339
-rw-r--r--  src/vppinfra/vector/test/ip_csum.c   120
-rw-r--r--  src/vppinfra/vector_neon.h            12
8 files changed, 487 insertions, 100 deletions
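
The new API carries a running state (clib_ip_csum_t), so a checksum can be computed in one shot or accumulated over discontiguous chunks and folded once at the end. For orientation, a minimal sketch of both patterns, assuming the new header introduced below; the example_* helper names are invented for illustration, while the types and entry points come straight from the patch:

#include <vppinfra/vector/ip_csum.h>

/* one-shot: a compile-time-constant count of 20 selects the unrolled
 * IPv4 header fast path inside clib_ip_csum () */
static_always_inline u16
example_ip4_header_csum (u8 *hdr)
{
  return clib_ip_csum (hdr, 20);
}

/* chunked: accumulate discontiguous pieces, then fold once; odd-length
 * chunks carry their trailing byte over via the 'odd' flag in the state */
static_always_inline u16
example_chunked_csum (u8 *hdr, u8 *rest, u16 rest_len)
{
  clib_ip_csum_t c = {};
  clib_ip_csum_chunk (&c, hdr, 20);
  clib_ip_csum_chunk (&c, rest, rest_len);
  return clib_ip_csum_fold (&c);
}
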
diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c
index 4cc9d5a49c1..8cc76a6c47b 100644
--- a/src/plugins/avf/output.c
+++ b/src/plugins/avf/output.c
@@ -19,6 +19,7 @@
 #include <vlib/unix/unix.h>
 #include <vlib/pci/pci.h>
 #include <vppinfra/ring.h>
+#include <vppinfra/vector/ip_csum.h>
 
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/ip/ip4_packet.h>
@@ -110,7 +111,7 @@ avf_tx_prepare_cksum (vlib_buffer_t * b, u8 is_tso)
 	is_tso ? 0 :
 	clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
			      (l4_hdr_offset - l3_hdr_offset));
-      sum = ~ip_csum (&psh, sizeof (psh));
+      sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
     }
   else
     {
@@ -119,7 +120,7 @@ avf_tx_prepare_cksum (vlib_buffer_t * b, u8 is_tso)
       psh.dst = ip6->dst_address;
       psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
       psh.l4len = is_tso ? 0 : ip6->payload_length;
-      sum = ~ip_csum (&psh, sizeof (psh));
+      sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
     }
 
   /* ip_csum does a byte swap for some reason... */
diff --git a/src/vnet/ip/ip4_input.h b/src/vnet/ip/ip4_input.h
index 383ef31758c..53948d60266 100644
--- a/src/vnet/ip/ip4_input.h
+++ b/src/vnet/ip/ip4_input.h
@@ -42,6 +42,7 @@
 
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>
+#include <vppinfra/vector/ip_csum.h>
 
 typedef enum
 {
@@ -63,15 +64,16 @@ check_ver_opt_csum (ip4_header_t * ip, u8 * error, int verify_checksum)
       if ((ip->ip_version_and_header_length & 0xf) != 5)
 	{
 	  *error = IP4_ERROR_OPTIONS;
-	  if (verify_checksum && ip_csum (ip, ip4_header_bytes (ip)) != 0)
+	  if (verify_checksum &&
+	      clib_ip_csum ((u8 *) ip, ip4_header_bytes (ip)) != 0)
 	    *error = IP4_ERROR_BAD_CHECKSUM;
 	}
       else
 	*error = IP4_ERROR_VERSION;
     }
-  else
-    if (PREDICT_FALSE (verify_checksum &&
-		       ip_csum (ip, sizeof (ip4_header_t)) != 0))
+  else if (PREDICT_FALSE (verify_checksum &&
+			  clib_ip_csum ((u8 *) ip, sizeof (ip4_header_t)) !=
+			    0))
     *error = IP4_ERROR_BAD_CHECKSUM;
 }
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index 837b3df8563..04cf9f11d70 100755
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -149,98 +149,6 @@ STATIC_ASSERT_SIZEOF (ip_ecn_t, 1);
 
 extern u8 *format_ip_ecn (u8 * s, va_list * va);
 
-/* IP checksum support. */
-
-static_always_inline u16
-ip_csum (void *data, u16 n_left)
-{
-  u32 sum;
-#ifdef CLIB_HAVE_VEC256
-  u16x16 v1, v2;
-  u32x8 zero = { 0 };
-  u32x8 sum8 = { 0 };
-  u32x4 sum4;
-#endif
-
-  /* if there is odd number of bytes, pad by zero and store in sum */
-  sum = (n_left & 1) ? ((u8 *) data)[n_left - 1] << 8 : 0;
-
-  /* we deal with words */
-  n_left >>= 1;
-
-#ifdef CLIB_HAVE_VEC256
-  while (n_left >= 32)
-    {
-      v1 = u16x16_load_unaligned (data);
-      v2 = u16x16_load_unaligned (data + 32);
-
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-      v2 = u16x16_byte_swap (v2);
-#endif
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v2));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v2));
-      n_left -= 32;
-      data += 64;
-    }
-
-  if (n_left >= 16)
-    {
-      v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-#endif
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-      n_left -= 16;
-      data += 32;
-    }
-
-  if (n_left)
-    {
-      v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
-      v1 = u16x16_byte_swap (v1);
-#endif
-      v1 = u16x16_mask_last (v1, 16 - n_left);
-      sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
-      sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
-    }
-
-  sum8 = u32x8_hadd (sum8, zero);
-  sum4 = u32x8_extract_lo (sum8) + u32x8_extract_hi (sum8);
-  sum += sum4[0] + sum4[1];
-
-#else
-  /* scalar version */
-  while (n_left >= 8)
-    {
-      sum += clib_net_to_host_u16 (*((u16 *) data + 0));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 1));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 2));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 3));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 4));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 5));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 6));
-      sum += clib_net_to_host_u16 (*((u16 *) data + 7));
-      n_left -= 8;
-      data += 16;
-    }
-  while (n_left)
-    {
-      sum += clib_net_to_host_u16 (*(u16 *) data);
-      n_left -= 1;
-      data += 2;
-    }
-#endif
-
-  sum = (sum & 0xffff) + (sum >> 16);
-  sum = (sum & 0xffff) + (sum >> 16);
-  return ~((u16) sum);
-}
-
 /* Incremental checksum update. */
 typedef uword ip_csum_t;
diff --git a/src/vnet/ip/ip_psh_cksum.h b/src/vnet/ip/ip_psh_cksum.h
index eaac401f223..8723749865f 100644
--- a/src/vnet/ip/ip_psh_cksum.h
+++ b/src/vnet/ip/ip_psh_cksum.h
@@ -7,6 +7,7 @@
 #define included_ip_psh_cksum_h
 
 #include <vnet/ip/ip.h>
+#include <vppinfra/vector/ip_csum.h>
 
 typedef struct _ip4_psh
 {
@@ -37,7 +38,8 @@ ip4_pseudo_header_cksum (ip4_header_t *ip4)
   psh.proto = ip4->protocol;
   psh.l4len = clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
				    sizeof (ip4_header_t));
-  return ~clib_net_to_host_u16 (ip_csum (&psh, sizeof (ip4_psh_t)));
+  return ~clib_net_to_host_u16 (
+    clib_ip_csum ((u8 *) &psh, sizeof (ip4_psh_t)));
 }
 
 static_always_inline u16
@@ -48,7 +50,8 @@ ip6_pseudo_header_cksum (ip6_header_t *ip6)
   psh.dst = ip6->dst_address;
   psh.l4len = ip6->payload_length;
   psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
-  return ~clib_net_to_host_u16 (ip_csum (&psh, sizeof (ip6_psh_t)));
+  return ~clib_net_to_host_u16 (
+    clib_ip_csum ((u8 *) &psh, sizeof (ip6_psh_t)));
 }
 
 #endif /* included_ip_psh_cksum_h */
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 9f407a10a22..7a73fe531d7 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -194,6 +194,7 @@ set(VPPINFRA_HEADERS
   vector/compress.h
   vector/count_equal.h
   vector/index_to_ptr.h
+  vector/ip_csum.h
   vector/mask_compare.h
   vector.h
   vector_neon.h
@@ -275,6 +276,7 @@ set(test_files
   vector/test/compress.c
   vector/test/count_equal.c
   vector/test/index_to_ptr.c
+  vector/test/ip_csum.c
   vector/test/mask_compare.c
   vector/test/memcpy_x86_64.c
 )
diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h
new file mode 100644
index 00000000000..2cea9b448ea
--- /dev/null
+++ b/src/vppinfra/vector/ip_csum.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_ip_csum_h
+#define included_vector_ip_csum_h
+#include <vppinfra/clib.h>
+typedef struct
+{
+  u64 sum;
+  u8 odd;
+} clib_ip_csum_t;
+
+#if defined(CLIB_HAVE_VEC128)
+static_always_inline u64x2
+clib_ip_csum_cvt_and_add_4 (u32x4 v)
+{
+  return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
+	  (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_2 (u64x2 v)
+{
+  return v[0] + v[1];
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+static_always_inline u64x4
+clib_ip_csum_cvt_and_add_8 (u32x8 v)
+{
+  return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
+	  (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_4 (u64x4 v)
+{
+  return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC512)
+static_always_inline u64x8
+clib_ip_csum_cvt_and_add_16 (u32x16 v)
+{
+  return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
+	  (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_8 (u64x8 v)
+{
+  return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
+}
+#endif
+
+static_always_inline void
+clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
+		     int is_copy)
+{
+  if (c->odd)
+    {
+      c->odd = 0;
+      c->sum += (u16) src[0] << 8;
+      if (is_copy)
+	dst++[0] = src[0];
+      count--;
+      src++;
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  u64x8 sum8 = {};
+
+  while (count >= 512)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
+      count -= 512;
+      src += 512;
+      if (is_copy)
+	{
+	  u32x16u *d = (u32x16u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 512;
+	}
+    }
+
+  while (count >= 64)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      count -= 64;
+      src += 64;
+      if (is_copy)
+	{
+	  u32x16u *d = (u32x16u *) dst;
+	  d[0] = s[0];
+	  dst += 64;
+	}
+    }
+
+#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE
+  if (count)
+    {
+      u64 mask = pow2_mask (count);
+      u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
+      sum8 += clib_ip_csum_cvt_and_add_16 (v);
+      c->odd = count & 1;
+      if (is_copy)
+	u32x16_mask_store (v, dst, mask);
+    }
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+#elif defined(CLIB_HAVE_VEC256)
+  u64x4 sum4 = {};
+
+  while (count >= 256)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
+      count -= 256;
+      src += 256;
+      if (is_copy)
+	{
+	  u32x8u *d = (u32x8u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 256;
+	}
+    }
+
+  while (count >= 32)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      count -= 32;
+      src += 32;
+      if (is_copy)
+	{
+	  u32x8u *d = (u32x8u *) dst;
+	  d[0] = s[0];
+	  dst += 32;
+	}
+    }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  if (count)
+    {
+      u32 mask = pow2_mask (count);
+      u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
+      sum4 += clib_ip_csum_cvt_and_add_8 (v);
+      c->odd = count & 1;
+      if (is_copy)
+	u32x8_mask_store (v, dst, mask);
+    }
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+#elif defined(CLIB_HAVE_VEC128)
+  u64x2 sum2 = {};
+
+  while (count >= 128)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
+      count -= 128;
+      src += 128;
+      if (is_copy)
+	{
+	  u32x4u *d = (u32x4u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 128;
+	}
+    }
+
+  while (count >= 16)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      count -= 16;
+      src += 16;
+      if (is_copy)
+	{
+	  u32x4u *d = (u32x4u *) dst;
+	  d[0] = s[0];
+	  dst += 16;
+	}
+    }
+  c->sum += clib_ip_csum_hadd_2 (sum2);
+#else
+  while (count >= 4)
+    {
+      u32 v = *((u32 *) src);
+      c->sum += v;
+      count -= 4;
+      src += 4;
+      if (is_copy)
+	{
+	  *(u32 *) dst = v;
+	  dst += 4;
+	}
+    }
+#endif
+  while (count >= 2)
+    {
+      u16 v = *((u16 *) src);
+      c->sum += v;
+      count -= 2;
+      src += 2;
+      if (is_copy)
+	{
+	  *(u16 *) dst = v;
+	  dst += 2;
+	}
+    }
+
+  if (count)
+    {
+      c->odd = 1;
+      c->sum += (u16) src[0];
+      if (is_copy)
+	dst[0] = src[0];
+    }
+}
+
+static_always_inline u16
+clib_ip_csum_fold (clib_ip_csum_t *c)
+{
+  u64 sum = c->sum;
+#if defined(__x86_64__) && defined(__BMI2__)
+  u64 tmp = sum;
+  asm volatile(
+    /* using ADC is much faster than mov, shift, add sequence
+     * compiler produces */
+    "shr $32, %[sum]			\n\t"
+    "add %k[tmp], %k[sum]		\n\t"
+    "mov $16, %k[tmp]			\n\t"
+    "shrx %k[tmp], %k[sum], %k[tmp]	\n\t"
+    "adc %w[tmp], %w[sum]		\n\t"
+    "adc $0, %w[sum]			\n\t"
+    : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
+#else
+  sum = ((u32) sum) + (sum >> 32);
+  sum = ((u16) sum) + (sum >> 16);
+  sum = ((u16) sum) + (sum >> 16);
+#endif
+  return (~((u16) sum));
+}
+
+static_always_inline void
+clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
+{
+  return clib_ip_csum_inline (c, 0, src, count, 0);
+}
+
+static_always_inline void
+clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
+{
+  return clib_ip_csum_inline (c, dst, src, count, 1);
+}
+
+static_always_inline u16
+clib_ip_csum (u8 *src, u16 count)
+{
+  clib_ip_csum_t c = {};
+  if (COMPILE_TIME_CONST (count) && count == 12)
+    {
+      for (int i = 0; i < 3; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 20)
+    {
+      for (int i = 0; i < 5; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 40)
+    {
+      for (int i = 0; i < 10; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else
+    clib_ip_csum_inline (&c, 0, src, count, 0);
+  return clib_ip_csum_fold (&c);
+}
+
+static_always_inline u16
+clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
+{
+  clib_ip_csum_t c = {};
+  clib_ip_csum_inline (&c, dst, src, count, 1);
+  return clib_ip_csum_fold (&c);
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c
new file mode 100644
index 00000000000..135d5ae63b2
--- /dev/null
+++ b/src/vppinfra/vector/test/ip_csum.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/ip_csum.h>
+
+typedef struct
+{
+  struct
+  {
+    u8 *src;
+    u32 count;
+  } chunk[5];
+  u16 result;
+} ip_csum_test_t;
+
+static u8 test1[] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40,
+		      0x00, 0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8,
+		      0x00, 0x01, 0xc0, 0xa8, 0x00, 0xc7, 0x00 };
+#define TEST_LEN(x) (ARRAY_LEN (x) - 1)
+
+static ip_csum_test_t tests[] = {
+  {
+    .chunk[0].src = test1,
+    .chunk[0].count = TEST_LEN (test1),
+    .result = 0x61b8,
+  },
+  {
+    .chunk[0].src = test1,
+    .chunk[0].count = 1,
+    .chunk[1].src = test1 + 1,
+    .chunk[1].count = 2,
+    .chunk[2].src = test1 + 3,
+    .chunk[2].count = 3,
+    .chunk[3].src = test1 + 6,
+    .chunk[3].count = 4,
+    .chunk[4].src = test1 + 10,
+    .chunk[4].count = TEST_LEN (test1) - 10,
+    .result = 0x61b8,
+  },
+  {
+    .chunk[0].count = 1,
+    .result = 0xff0f,
+  },
+  {
+    .chunk[0].count = 2,
+    .result = 0x080f,
+  },
+  {
+    .chunk[0].count = 3,
+    .result = 0x0711,
+  },
+  {
+    .chunk[0].count = 4,
+    .result = 0x1210,
+  },
+  {
+    .chunk[0].count = 63,
+    .result = 0xda01,
+  },
+  {
+    .chunk[0].count = 64,
+    .result = 0xe100,
+  },
+  {
+    .chunk[0].count = 65,
+    .result = 0xe010,
+  },
+  {
+    .chunk[0].count = 65535,
+    .result = 0xfc84,
+  },
+  {
+    .chunk[0].count = 65536,
+    .result = 0xffff,
+  }
+};
+
+static clib_error_t *
+test_clib_ip_csum (clib_error_t *err)
+{
+  u8 *buf;
+  buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES);
+  for (int i = 0; i < 65536; i++)
+    buf[i] = 0xf0 + ((i * 7) & 0xf);
+
+  for (int i = 0; i < ARRAY_LEN (tests); i++)
+    {
+      clib_ip_csum_t c = {};
+      ip_csum_test_t *t = tests + i;
+      u16 rv;
+
+      for (int j = 0; j < ARRAY_LEN (((ip_csum_test_t *) 0)->chunk); j++)
+	if (t->chunk[j].count > 0)
+	  {
+	    if (t->chunk[j].src == 0)
+	      clib_ip_csum_chunk (&c, buf, t->chunk[j].count);
+	    else
+	      clib_ip_csum_chunk (&c, t->chunk[j].src, t->chunk[j].count);
+	  }
+      rv = clib_ip_csum_fold (&c);
+
+      if (rv != tests[i].result)
+	{
+	  err = clib_error_return (err,
+				   "bad checksum in test case %u (expected "
+				   "0x%04x, calculated 0x%04x)",
+				   i, tests[i].result, rv);
+	  goto done;
+	}
+    }
+done:
+  clib_mem_free (buf);
+  return err;
+}
+
+REGISTER_TEST (clib_ip_csum) = {
+  .name = "clib_ip_csum",
+  .fn = test_clib_ip_csum,
+};
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index 70b05c60884..80d7bda9f3c 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -211,6 +211,18 @@ u32x4_min_scalar (u32x4 v)
 #define u8x16_word_shift_left(x,n)  vextq_u8(u8x16_splat (0), x, 16 - n)
 #define u8x16_word_shift_right(x,n) vextq_u8(x, u8x16_splat (0), n)
 
+always_inline u32x4
+u32x4_interleave_hi (u32x4 a, u32x4 b)
+{
+  return (u32x4) vzip2q_u32 (a, b);
+}
+
+always_inline u32x4
+u32x4_interleave_lo (u32x4 a, u32x4 b)
+{
+  return (u32x4) vzip1q_u32 (a, b);
+}
+
 static_always_inline u8x16
 u8x16_reflect (u8x16 v)
 {
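
The csum_and_copy variants named in the commit title are not exercised by the converted call sites above; they let a driver fold checksum computation into a copy it already has to perform. A hedged sketch of the intended pattern, assuming only the new header; tx_copy_with_csum and tx_gather_with_csum are invented names for illustration, not call sites from this patch:

#include <vppinfra/vector/ip_csum.h>

/* copy payload into a tx buffer and obtain its checksum in the same pass,
 * instead of a separate memcpy followed by clib_ip_csum () */
static_always_inline u16
tx_copy_with_csum (u8 *tx_buf, u8 *payload, u16 len)
{
  return clib_ip_csum_and_copy (tx_buf, payload, len);
}

/* scatter-gather: checksum-and-copy each segment into one linear buffer;
 * odd-length segments are handled across chunks via the state's odd flag */
static_always_inline u16
tx_gather_with_csum (u8 *tx_buf, u8 **seg, u16 *seg_len, int n_segs)
{
  clib_ip_csum_t c = {};
  for (int i = 0; i < n_segs; i++)
    {
      clib_ip_csum_and_copy_chunk (&c, seg[i], tx_buf, seg_len[i]);
      tx_buf += seg_len[i];
    }
  return clib_ip_csum_fold (&c);
}
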