summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Damjan Marion <damarion@cisco.com> 2021-10-28 12:02:15 +0200
committer: Florin Coras <florin.coras@gmail.com> 2021-10-28 15:54:25 +0000
commit: e6709ff37dc0f3a58ed5ad98aace73fe801f1e9d (patch)
tree: d71e12113f9eb31d789d059aa4073b07a44409fb
parent: 48c0534c2eafe23fe8efba8c913109f30f6a294c (diff)
ip: improve csum fold on x86_64
New code seems to be 1.5 clocks faster.

old:
  mov eax,edi
  shr rdi,0x20
  add rdi,rax
  movzx edx,di
  shr rdi,0x10
  add rdx,rdi
  movzx eax,dx
  shr rdx,0x10
  add rax,rdx
  mov rdx,rax
  shr rdx,0x10
  add eax,edx

new:
  mov rax,rdi
  shr rax,0x20
  add eax,edi
  mov edi,0x10
  shrx edi,eax,edi
  adc ax,di
  adc ax,0x0

Type: improvement

Change-Id: I3c565812c67ff4c3db197a9d4137a6c131b5b66c
Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r-- src/vnet/ip/ip_packet.h 16
1 files changed, 15 insertions, 1 deletions
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index b0b5f41260c..d862caa3a52 100644
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -301,6 +301,20 @@ always_inline u16
ip_csum_fold (ip_csum_t c)
{
/* Reduce to 16 bits. */
+#ifdef __x86_64__
+ u64 tmp;
+ asm volatile(
+ /* using ADC is much faster than mov, shift, add sequence
+ * compiler produces */
+ "mov %k[sum], %k[tmp] \n\t"
+ "shr $32, %[sum] \n\t"
+ "add %k[tmp], %k[sum] \n\t"
+ "mov $16, %k[tmp] \n\t"
+ "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+ "adc %w[tmp], %w[sum] \n\t"
+ "adc $0, %w[sum] \n\t"
+ : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
+#else
#if uword_bits == 64
c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
c = (c & 0xffff) + (c >> 16);
@@ -308,7 +322,7 @@ ip_csum_fold (ip_csum_t c)
c = (c & 0xffff) + (c >> 16);
c = (c & 0xffff) + (c >> 16);
-
+#endif
return c;
}