From ffc6bdcd38b8209050671d3d86f943c37887a7b7 Mon Sep 17 00:00:00 2001 From: Nathan Skrzypczak Date: Mon, 1 Feb 2021 17:13:59 +0100 Subject: devices: af-packet gso mtu Type: fix Set the GSO flag when buffer length exceeds the linux mtu. Don't listen for mtu changes on linux side for now. This also fixes a TX issue, as we only search for valid frames on tx to the extent of n_left, we might stay stuck. Change-Id: Idf0bdd88990254a614962c2f7bc3e0292ccfd61a Signed-off-by: Nathan Skrzypczak --- src/vnet/devices/af_packet/af_packet.c | 25 ++++++++++ src/vnet/devices/af_packet/af_packet.h | 1 + src/vnet/devices/af_packet/device.c | 88 +++++++++++++++++++++++++--------- src/vnet/devices/af_packet/node.c | 50 ++++++++++++------- 4 files changed, 126 insertions(+), 38 deletions(-) (limited to 'src/vnet/devices/af_packet') diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index ba6bf3d5a46..2cc0cc70bca 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -83,11 +83,32 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, clib_error_free (error); return VNET_API_ERROR_SYSCALL_ERROR_1; } + else + apif->host_mtu = hi->max_packet_bytes; } return 0; } +static int +af_packet_read_mtu (af_packet_if_t *apif) +{ + af_packet_main_t *apm = &af_packet_main; + clib_error_t *error; + u8 *s; + s = format (0, "/sys/class/net/%s/mtu%c", apif->host_if_name, 0); + error = clib_sysfs_read ((char *) s, "%d", &apif->host_mtu); + vec_free (s); + if (error) + { + vlib_log_err (apm->log_class, "sysfs read failed to get MTU: %U", + format_clib_error, error); + clib_error_free (error); + return VNET_API_ERROR_SYSCALL_ERROR_1; + } + return 0; +} + static clib_error_t * af_packet_fd_read_ready (clib_file_t * uf) { @@ -338,6 +359,10 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, apif->next_tx_frame = 0; apif->next_rx_frame = 0; + ret = af_packet_read_mtu (apif); + if (ret != 0) + goto error; + if (tm->n_vlib_mains > 1) clib_spinlock_init (&apif->lockp); diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h index 395f2d93fdf..fc35b48b85e 100644 --- a/src/vnet/devices/af_packet/af_packet.h +++ b/src/vnet/devices/af_packet/af_packet.h @@ -48,6 +48,7 @@ typedef struct u32 per_interface_next_index; u8 is_admin_up; u32 queue_index; + u32 host_mtu; } af_packet_if_t; typedef struct diff --git a/src/vnet/devices/af_packet/device.c b/src/vnet/devices/af_packet/device.c index 2886fb5b6d5..b6b99a0465c 100644 --- a/src/vnet/devices/af_packet/device.c +++ b/src/vnet/devices/af_packet/device.c @@ -67,7 +67,48 @@ format_af_packet_device_name (u8 * s, va_list * args) static u8 * format_af_packet_device (u8 * s, va_list * args) { - s = format (s, "Linux PACKET socket interface"); + u32 dev_instance = va_arg (*args, u32); + u32 indent = format_get_indent (s); + int __clib_unused verbose = va_arg (*args, int); + + af_packet_main_t *apm = &af_packet_main; + af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, dev_instance); + clib_spinlock_lock_if_init (&apif->lockp); + u32 block_size = apif->tx_req->tp_block_size; + u32 frame_size = apif->tx_req->tp_frame_size; + u32 frame_num = apif->tx_req->tp_frame_nr; + int block = 0; + u8 *block_start = apif->tx_ring + block * block_size; + u32 tx_frame = apif->next_tx_frame; + struct tpacket2_hdr *tph; + + s = format (s, "Linux PACKET socket interface\n"); + s = format (s, "%Ublock:%d frame:%d\n", format_white_space, indent, + block_size, frame_size); + s = format (s, "%Unext frame:%d\n", format_white_space, indent, + apif->next_tx_frame); + + int n_send_req = 0, n_avail = 0, n_sending = 0, n_tot = 0, n_wrong = 0; + do + { + tph = (struct tpacket2_hdr *) (block_start + tx_frame * frame_size); + tx_frame = (tx_frame + 1) % frame_num; + if (tph->tp_status == 0) + n_avail++; + else if (tph->tp_status & TP_STATUS_SEND_REQUEST) + n_send_req++; + else if (tph->tp_status & TP_STATUS_SENDING) + n_sending++; + else + n_wrong++; + n_tot++; + } + while (tx_frame != apif->next_tx_frame); + s = format (s, "%Uavailable:%d request:%d sending:%d wrong:%d total:%d\n", + format_white_space, indent, n_avail, n_send_req, n_sending, + n_wrong, n_tot); + + clib_spinlock_unlock_if_init (&apif->lockp); return s; } @@ -99,7 +140,7 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm, struct tpacket2_hdr *tph; u32 frame_not_ready = 0; - while (n_left > 0) + while (n_left) { u32 len; u32 offset = 0; @@ -108,13 +149,17 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm, u32 bi = buffers[0]; buffers++; + nextframe: tph = (struct tpacket2_hdr *) (block_start + tx_frame * frame_size); - - if (PREDICT_FALSE - (tph->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))) + if (PREDICT_FALSE (tph->tp_status & + (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))) { + tx_frame = (tx_frame + 1) % frame_num; frame_not_ready++; - goto next; + /* check if we've exhausted the ring */ + if (PREDICT_FALSE (frame_not_ready + n_sent == frame_num)) + break; + goto nextframe; } do @@ -132,7 +177,7 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm, tph->tp_len = tph->tp_snaplen = offset; tph->tp_status = TP_STATUS_SEND_REQUEST; n_sent++; - next: + tx_frame = (tx_frame + 1) % frame_num; /* check if we've exhausted the ring */ @@ -142,23 +187,22 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm, CLIB_MEMORY_BARRIER (); - if (PREDICT_TRUE (n_sent)) - { - apif->next_tx_frame = tx_frame; + apif->next_tx_frame = tx_frame; - if (PREDICT_FALSE (sendto (apif->fd, NULL, 0, - MSG_DONTWAIT, NULL, 0) == -1)) - { - /* Uh-oh, drop & move on, but count whether it was fatal or not. - * Note that we have no reliable way to properly determine the - * disposition of the packets we just enqueued for delivery. - */ - vlib_error_count (vm, node->node_index, - unix_error_is_fatal (errno) ? + if (PREDICT_TRUE (n_sent)) + if (PREDICT_FALSE (sendto (apif->fd, NULL, 0, MSG_DONTWAIT, NULL, 0) == + -1)) + { + /* Uh-oh, drop & move on, but count whether it was fatal or not. + * Note that we have no reliable way to properly determine the + * disposition of the packets we just enqueued for delivery. + */ + vlib_error_count (vm, node->node_index, + unix_error_is_fatal (errno) ? AF_PACKET_TX_ERROR_TXRING_FATAL : - AF_PACKET_TX_ERROR_TXRING_EAGAIN, n_sent); - } - } + AF_PACKET_TX_ERROR_TXRING_EAGAIN, + n_sent); + } clib_spinlock_unlock_if_init (&apif->lockp); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index d444b3b6eea..f4db6399a57 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -109,7 +109,15 @@ buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, u32 prev_bi) } static_always_inline void -mark_tcp_udp_cksum_calc (vlib_buffer_t * b) +fill_gso_buffer_flags (vlib_buffer_t *b, u32 gso_size, u8 l4_hdr_sz) +{ + b->flags |= VNET_BUFFER_F_GSO; + vnet_buffer2 (b)->gso_size = gso_size; + vnet_buffer2 (b)->gso_l4_hdr_sz = l4_hdr_sz; +} + +static_always_inline void +mark_tcp_udp_cksum_calc (vlib_buffer_t *b, u8 *l4_hdr_sz) { ethernet_header_t *eth = vlib_buffer_get_current (b); if (clib_net_to_host_u16 (eth->type) == ETHERNET_TYPE_IP4) @@ -120,18 +128,20 @@ mark_tcp_udp_cksum_calc (vlib_buffer_t * b) if (ip4->protocol == IP_PROTOCOL_TCP) { b->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - ((tcp_header_t - *) (vlib_buffer_get_current (b) + - sizeof (ethernet_header_t) + - ip4_header_bytes (ip4)))->checksum = 0; + tcp_header_t *tcp = (tcp_header_t *) (vlib_buffer_get_current (b) + + sizeof (ethernet_header_t) + + ip4_header_bytes (ip4)); + tcp->checksum = 0; + *l4_hdr_sz = tcp_header_bytes (tcp); } else if (ip4->protocol == IP_PROTOCOL_UDP) { b->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; - ((udp_header_t - *) (vlib_buffer_get_current (b) + - sizeof (ethernet_header_t) + - ip4_header_bytes (ip4)))->checksum = 0; + udp_header_t *udp = (udp_header_t *) (vlib_buffer_get_current (b) + + sizeof (ethernet_header_t) + + ip4_header_bytes (ip4)); + udp->checksum = 0; + *l4_hdr_sz = sizeof (*udp); } vnet_buffer (b)->l3_hdr_offset = sizeof (ethernet_header_t); vnet_buffer (b)->l4_hdr_offset = @@ -156,16 +166,20 @@ mark_tcp_udp_cksum_calc (vlib_buffer_t * b) if (ip6->protocol == IP_PROTOCOL_TCP) { b->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - ((tcp_header_t - *) (vlib_buffer_get_current (b) + - sizeof (ethernet_header_t) + ip6_hdr_len))->checksum = 0; + tcp_header_t *tcp = + (tcp_header_t *) (vlib_buffer_get_current (b) + + sizeof (ethernet_header_t) + ip6_hdr_len); + tcp->checksum = 0; + *l4_hdr_sz = tcp_header_bytes (tcp); } else if (ip6->protocol == IP_PROTOCOL_UDP) { b->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; - ((udp_header_t - *) (vlib_buffer_get_current (b) + - sizeof (ethernet_header_t) + ip6_hdr_len))->checksum = 0; + udp_header_t *udp = + (udp_header_t *) (vlib_buffer_get_current (b) + + sizeof (ethernet_header_t) + ip6_hdr_len); + udp->checksum = 0; + *l4_hdr_sz = sizeof (*udp); } vnet_buffer (b)->l3_hdr_offset = sizeof (ethernet_header_t); vnet_buffer (b)->l4_hdr_offset = @@ -221,6 +235,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 data_len = tph->tp_snaplen; u32 offset = 0; u32 bi0 = 0, first_bi0 = 0, prev_bi0; + u8 l4_hdr_sz = 0; while (data_len) { @@ -275,7 +290,10 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, first_bi0 = bi0; first_b0 = vlib_get_buffer (vm, first_bi0); if (tph->tp_status & TP_STATUS_CSUMNOTREADY) - mark_tcp_udp_cksum_calc (first_b0); + mark_tcp_udp_cksum_calc (first_b0, &l4_hdr_sz); + if (tph->tp_snaplen > apif->host_mtu) + fill_gso_buffer_flags (first_b0, apif->host_mtu, + l4_hdr_sz); } else buffer_add_to_chain (vm, bi0, first_bi0, prev_bi0); -- cgit 1.2.3-korg