aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/ip/ip_packet.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/vnet/ip/ip_packet.h')
-rwxr-xr-x[-rw-r--r--]src/vnet/ip/ip_packet.h108
1 files changed, 15 insertions, 93 deletions
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index b0b5f41260c..04cf9f11d70 100644..100755
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -149,98 +149,6 @@ STATIC_ASSERT_SIZEOF (ip_ecn_t, 1);
extern u8 *format_ip_ecn (u8 * s, va_list * va);
-/* IP checksum support. */
-
-static_always_inline u16
-ip_csum (void *data, u16 n_left)
-{
- u32 sum;
-#ifdef CLIB_HAVE_VEC256
- u16x16 v1, v2;
- u32x8 zero = { 0 };
- u32x8 sum8 = { 0 };
- u32x4 sum4;
-#endif
-
- /* if there is odd number of bytes, pad by zero and store in sum */
- sum = (n_left & 1) ? ((u8 *) data)[n_left - 1] << 8 : 0;
-
- /* we deal with words */
- n_left >>= 1;
-
-#ifdef CLIB_HAVE_VEC256
- while (n_left >= 32)
- {
- v1 = u16x16_load_unaligned (data);
- v2 = u16x16_load_unaligned (data + 32);
-
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
- v2 = u16x16_byte_swap (v2);
-#endif
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v2));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v2));
- n_left -= 32;
- data += 64;
- }
-
- if (n_left >= 16)
- {
- v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
-#endif
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- n_left -= 16;
- data += 32;
- }
-
- if (n_left)
- {
- v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
-#endif
- v1 = u16x16_mask_last (v1, 16 - n_left);
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- }
-
- sum8 = u32x8_hadd (sum8, zero);
- sum4 = u32x8_extract_lo (sum8) + u32x8_extract_hi (sum8);
- sum += sum4[0] + sum4[1];
-
-#else
- /* scalar version */
- while (n_left >= 8)
- {
- sum += clib_net_to_host_u16 (*((u16 *) data + 0));
- sum += clib_net_to_host_u16 (*((u16 *) data + 1));
- sum += clib_net_to_host_u16 (*((u16 *) data + 2));
- sum += clib_net_to_host_u16 (*((u16 *) data + 3));
- sum += clib_net_to_host_u16 (*((u16 *) data + 4));
- sum += clib_net_to_host_u16 (*((u16 *) data + 5));
- sum += clib_net_to_host_u16 (*((u16 *) data + 6));
- sum += clib_net_to_host_u16 (*((u16 *) data + 7));
- n_left -= 8;
- data += 16;
- }
- while (n_left)
- {
- sum += clib_net_to_host_u16 (*(u16 *) data);
- n_left -= 1;
- data += 2;
- }
-#endif
-
- sum = (sum & 0xffff) + (sum >> 16);
- sum = (sum & 0xffff) + (sum >> 16);
- return ~((u16) sum);
-}
-
/* Incremental checksum update. */
typedef uword ip_csum_t;
@@ -301,6 +209,20 @@ always_inline u16
ip_csum_fold (ip_csum_t c)
{
/* Reduce to 16 bits. */
+#if defined(__x86_64__) && defined(__BMI2__)
+ u64 tmp;
+ asm volatile(
+ /* using ADC is much faster than mov, shift, add sequence
+ * compiler produces */
+ "mov %k[sum], %k[tmp] \n\t"
+ "shr $32, %[sum] \n\t"
+ "add %k[tmp], %k[sum] \n\t"
+ "mov $16, %k[tmp] \n\t"
+ "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+ "adc %w[tmp], %w[sum] \n\t"
+ "adc $0, %w[sum] \n\t"
+ : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
+#else
#if uword_bits == 64
c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
c = (c & 0xffff) + (c >> 16);
@@ -308,7 +230,7 @@ ip_csum_fold (ip_csum_t c)
c = (c & 0xffff) + (c >> 16);
c = (c & 0xffff) + (c >> 16);
-
+#endif
return c;
}