From 3eab064e3fadaf2a6a128f167ad04ca0319b4e17 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Tue, 3 Oct 2017 10:08:05 +0100 Subject: VPP-1001 - update AF Packet Driver for modern kernels 1. Add VNET headers support for checksumming - required to operate correctly on any recent Linux 2. Bypass QDISC on transmit - improves performance by ~5%. Enabled only if the macro is detected - apparently not present on archaic distributions. This still does not solve all issues with TSO - it can be fixed only by going to tpacket v3 and dynamic rx ring as well as significant changes in the TX (sendmmsg?). Change-Id: Iea14ade12586c0a8da49e6dd1012108a08bc85b3 Signed-off-by: Anton Ivanov --- src/vnet/devices/af_packet/af_packet.c | 30 ++++++++++++++-- src/vnet/devices/af_packet/af_packet.h | 3 ++ src/vnet/devices/af_packet/device.c | 30 +++++++++++++++- src/vnet/devices/af_packet/node.c | 66 ++++++++++++++++++++++++++++++---- 4 files changed, 119 insertions(+), 10 deletions(-) diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index 32696014727..fbcd488ac9b 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -128,6 +129,7 @@ static int create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req, tpacket_req_t * tx_req, int *fd, u8 ** ring) { + af_packet_main_t *apm = &af_packet_main; int ret, err; struct sockaddr_ll sll; int ver = TPACKET_V2; @@ -141,7 +143,31 @@ create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req, ret = VNET_API_ERROR_SYSCALL_ERROR_1; goto error; } - + int opt = 1; + if (setsockopt (*fd, SOL_PACKET, PACKET_VNET_HDR, &opt, sizeof (opt)) != 0) + { + DBG_SOCK ("Failed to enable vnet headers on the socket"); + if ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0) + { + /* Should never happen - vnet was already enabled once, + * but we fail to reenable it on a new interface + **/ + ret 
= VNET_API_ERROR_SYSCALL_ERROR_1; + goto error; + } + } + else + { + apm->flags |= AF_PACKET_USES_VNET_HEADERS; + } +#ifdef PACKET_QDISC_BYPASS + opt = 1; + if (setsockopt (*fd, SOL_PACKET, PACKET_QDISC_BYPASS, &opt, sizeof (opt)) != + 0) + { + DBG_SOCK ("Failed to bypass Linux QDISC"); + } +#endif if ((err = setsockopt (*fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof (ver))) < 0) { @@ -150,7 +176,7 @@ create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req, goto error; } - int opt = 1; + opt = 1; if ((err = setsockopt (*fd, SOL_PACKET, PACKET_LOSS, &opt, sizeof (opt))) < 0) { diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h index 95c7e7cf5a7..f731427c6b5 100644 --- a/src/vnet/devices/af_packet/af_packet.h +++ b/src/vnet/devices/af_packet/af_packet.h @@ -19,6 +19,8 @@ #include +#define AF_PACKET_USES_VNET_HEADERS 1 + typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); @@ -54,6 +56,7 @@ typedef struct /* hash of host interface names */ mhash_t if_index_by_host_if_name; + u32 flags; } af_packet_main_t; af_packet_main_t af_packet_main; diff --git a/src/vnet/devices/af_packet/device.c b/src/vnet/devices/af_packet/device.c index e01b1c71b32..a48ae5cf25d 100644 --- a/src/vnet/devices/af_packet/device.c +++ b/src/vnet/devices/af_packet/device.c @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -50,7 +52,6 @@ static char *af_packet_tx_func_error_strings[] = { #undef _ }; - static u8 * format_af_packet_device_name (u8 * s, va_list * args) { @@ -76,6 +77,23 @@ format_af_packet_tx_trace (u8 * s, va_list * args) return s; } + +static_always_inline void +af_packet_buffer_tx_offload (vlib_buffer_t * b, struct virtio_net_hdr *vhdr) +{ + /* For now - just mark the data as valid, + * DPDK csums on input, tap presently operates in legacy + * compatibility mode where the kernel checksums CSUM_PARTIAL + * for it and we have fixed the af_packet input + * + * In the future, locally originated frames, 
etc can be made + * to fit this convention so that they are not checksummed + * unless needed. + **/ + vhdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; +} + + static uword af_packet_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -102,6 +120,10 @@ af_packet_interface_tx (vlib_main_t * vm, { u32 len; u32 offset = 0; + if (PREDICT_TRUE ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0)) + { + offset = sizeof (struct virtio_net_hdr); + } vlib_buffer_t *b0; n_left--; u32 bi = buffers[0]; @@ -119,6 +141,12 @@ af_packet_interface_tx (vlib_main_t * vm, do { b0 = vlib_get_buffer (vm, bi); + if (PREDICT_TRUE ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0)) + { + u8 *vh = + (u8 *) tph + TPACKET_ALIGN (sizeof (struct tpacket2_hdr)); + af_packet_buffer_tx_offload (b0, (struct virtio_net_hdr *) vh); + } len = b0->current_length; clib_memcpy ((u8 *) tph + TPACKET_ALIGN (sizeof (struct tpacket2_hdr)) + offset, diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index 99c91f38805..5301ad299f2 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -1,5 +1,4 @@ -/* - *------------------------------------------------------------------ +/*------------------------------------------------------------------ * af_packet.c - linux kernel packet interface * * Copyright (c) 2016 Cisco and/or its affiliates. 
@@ -18,6 +17,7 @@ */ #include +#include #include #include @@ -155,9 +155,18 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, while ((tph->tp_status & TP_STATUS_USER) && (n_free_bufs > min_bufs) && n_left_to_next) { + + struct virtio_net_hdr *vh = + (struct virtio_net_hdr *) (((u8 *) tph) + tph->tp_mac - + sizeof (struct virtio_net_hdr)); u32 data_len = tph->tp_snaplen; u32 offset = 0; u32 bi0 = 0, first_bi0 = 0, prev_bi0; + u32 vlan_len = 0; + ip_csum_t wsum = 0; + u16 *wsum_addr = NULL; + u32 do_vnet = apm->flags & AF_PACKET_USES_VNET_HEADERS; + u32 do_csum = tph->tp_status & TP_STATUS_CSUMNOTREADY; while (data_len) { @@ -173,7 +182,6 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, /* copy data */ u32 bytes_to_copy = data_len > n_buffer_bytes ? n_buffer_bytes : data_len; - u32 vlan_len = 0; u32 bytes_copied = 0; b0->current_data = 0; /* Kernel removes VLAN headers, so reconstruct VLAN */ @@ -195,10 +203,50 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, bytes_copied = sizeof (ethernet_header_t); } } - clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) + - bytes_copied + vlan_len, - (u8 *) tph + tph->tp_mac + offset + bytes_copied, - (bytes_to_copy - bytes_copied)); + /* Check if the incoming skb is marked as CSUM_PARTIAL, + * If VNET Headers are enabled TP_STATUS_CSUMNOTREADY is + * equivalent to the vnet csum flag. 
+ **/ + if (PREDICT_TRUE ((do_vnet != 0) && (do_csum != 0))) + { + wsum_addr = (u16 *) (((u8 *) vlib_buffer_get_current (b0)) + + vlan_len + vh->csum_start + + vh->csum_offset); + if (bytes_copied <= vh->csum_start) + { + clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) + + bytes_copied + vlan_len, + (u8 *) tph + tph->tp_mac + offset + + bytes_copied, + (vh->csum_start - bytes_copied)); + wsum = + ip_csum_and_memcpy (wsum, + ((u8 *) + vlib_buffer_get_current (b0)) + + vh->csum_start + vlan_len, + (u8 *) tph + tph->tp_mac + + offset + vh->csum_start, + (bytes_to_copy - vh->csum_start)); + } + else + { + wsum = + ip_csum_and_memcpy (wsum, + ((u8 *) + vlib_buffer_get_current (b0)) + + bytes_copied + vlan_len, + (u8 *) tph + tph->tp_mac + + offset + bytes_copied, + (bytes_to_copy - bytes_copied)); + } + } + else + { + clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) + + bytes_copied + vlan_len, + (u8 *) tph + tph->tp_mac + offset + + bytes_copied, (bytes_to_copy - bytes_copied)); + } /* fill buffer header */ b0->current_length = bytes_to_copy + vlan_len; @@ -218,6 +266,10 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, offset += bytes_to_copy; data_len -= bytes_to_copy; } + if (PREDICT_TRUE ((do_vnet != 0) && (do_csum != 0))) + { + *wsum_addr = ~ip_csum_fold (wsum); + } n_rx_packets++; n_rx_bytes += tph->tp_snaplen; to_next[0] = first_bi0; -- cgit 1.2.3-korg