From e6709ff37dc0f3a58ed5ad98aace73fe801f1e9d Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Thu, 28 Oct 2021 12:02:15 +0200
Subject: ip: improve csum fold on x86_64

New code seems to be 1.5 clocks faster.

old:
  mov eax,edi
  shr rdi,0x20
  add rdi,rax
  movzx edx,di
  shr rdi,0x10
  add rdx,rdi
  movzx eax,dx
  shr rdx,0x10
  add rax,rdx
  mov rdx,rax
  shr rdx,0x10
  add eax,edx

new:
  mov rax,rdi
  shr rax,0x20
  add eax,edi
  mov edi,0x10
  shrx edi,eax,edi
  adc ax,di
  adc ax,0x0

Type: improvement
Change-Id: I3c565812c67ff4c3db197a9d4137a6c131b5b66c
Signed-off-by: Damjan Marion
---
 src/vnet/ip/ip_packet.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index b0b5f41260c..d862caa3a52 100644
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -301,6 +301,20 @@ always_inline u16
 ip_csum_fold (ip_csum_t c)
 {
   /* Reduce to 16 bits. */
+#ifdef __x86_64__
+  u64 tmp;
+  asm volatile(
+    /* using ADC is much faster than mov, shift, add sequence
+     * compiler produces */
+    "mov %k[sum], %k[tmp] \n\t"
+    "shr $32, %[sum] \n\t"
+    "add %k[tmp], %k[sum] \n\t"
+    "mov $16, %k[tmp] \n\t"
+    "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+    "adc %w[tmp], %w[sum] \n\t"
+    "adc $0, %w[sum] \n\t"
+    : [ sum ] "+&r"(c), [ tmp ] "=&r"(tmp));
+#else
 #if uword_bits == 64
   c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
   c = (c & 0xffff) + (c >> 16);
@@ -308,7 +322,7 @@ ip_csum_fold (ip_csum_t c)
 
   c = (c & 0xffff) + (c >> 16);
   c = (c & 0xffff) + (c >> 16);
-
+#endif
   return c;
 }
 
-- 
cgit 1.2.3-korg