summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Nathan Skrzypczak <nathan.skrzypczak@gmail.com>, 2021-02-01 17:13:59 +0100
committer: Damjan Marion <dmarion@me.com>, 2021-02-05 19:02:38 +0000
commit: ffc6bdcd38b8209050671d3d86f943c37887a7b7 (patch)
tree: 4150c463c4b3bfbc91751d55ec272dc81d421289
parent: 5398dfb2592d525018997a991a4f7bfde515adc4 (diff)
devices: af-packet gso mtu
Type: fix

Set the GSO flag when the buffer length exceeds the Linux MTU.
Don't listen for MTU changes on the Linux side for now.

This also fixes a TX issue: because we only search for valid frames on TX to the extent of n_left, we might stay stuck.

Change-Id: Idf0bdd88990254a614962c2f7bc3e0292ccfd61a
Signed-off-by: Nathan Skrzypczak <nathan.skrzypczak@gmail.com>
-rw-r--r--  src/vnet/devices/af_packet/af_packet.c | 25
-rw-r--r--  src/vnet/devices/af_packet/af_packet.h |  1
-rw-r--r--  src/vnet/devices/af_packet/device.c    | 88
-rw-r--r--  src/vnet/devices/af_packet/node.c      | 50
4 files changed, 126 insertions, 38 deletions
diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c
index ba6bf3d5a46..2cc0cc70bca 100644
--- a/src/vnet/devices/af_packet/af_packet.c
+++ b/src/vnet/devices/af_packet/af_packet.c
@@ -83,11 +83,32 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
clib_error_free (error);
return VNET_API_ERROR_SYSCALL_ERROR_1;
}
+ else
+ apif->host_mtu = hi->max_packet_bytes;
}
return 0;
}
+static int
+af_packet_read_mtu (af_packet_if_t *apif)
+{
+ af_packet_main_t *apm = &af_packet_main;
+ clib_error_t *error;
+ u8 *s;
+ s = format (0, "/sys/class/net/%s/mtu%c", apif->host_if_name, 0);
+ error = clib_sysfs_read ((char *) s, "%d", &apif->host_mtu);
+ vec_free (s);
+ if (error)
+ {
+ vlib_log_err (apm->log_class, "sysfs read failed to get MTU: %U",
+ format_clib_error, error);
+ clib_error_free (error);
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+ return 0;
+}
+
static clib_error_t *
af_packet_fd_read_ready (clib_file_t * uf)
{
@@ -338,6 +359,10 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
apif->next_tx_frame = 0;
apif->next_rx_frame = 0;
+ ret = af_packet_read_mtu (apif);
+ if (ret != 0)
+ goto error;
+
if (tm->n_vlib_mains > 1)
clib_spinlock_init (&apif->lockp);
diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h
index 395f2d93fdf..fc35b48b85e 100644
--- a/src/vnet/devices/af_packet/af_packet.h
+++ b/src/vnet/devices/af_packet/af_packet.h
@@ -48,6 +48,7 @@ typedef struct
u32 per_interface_next_index;
u8 is_admin_up;
u32 queue_index;
+ u32 host_mtu;
} af_packet_if_t;
typedef struct
diff --git a/src/vnet/devices/af_packet/device.c b/src/vnet/devices/af_packet/device.c
index 2886fb5b6d5..b6b99a0465c 100644
--- a/src/vnet/devices/af_packet/device.c
+++ b/src/vnet/devices/af_packet/device.c
@@ -67,7 +67,48 @@ format_af_packet_device_name (u8 * s, va_list * args)
static u8 *
format_af_packet_device (u8 * s, va_list * args)
{
- s = format (s, "Linux PACKET socket interface");
+ u32 dev_instance = va_arg (*args, u32);
+ u32 indent = format_get_indent (s);
+ int __clib_unused verbose = va_arg (*args, int);
+
+ af_packet_main_t *apm = &af_packet_main;
+ af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, dev_instance);
+ clib_spinlock_lock_if_init (&apif->lockp);
+ u32 block_size = apif->tx_req->tp_block_size;
+ u32 frame_size = apif->tx_req->tp_frame_size;
+ u32 frame_num = apif->tx_req->tp_frame_nr;
+ int block = 0;
+ u8 *block_start = apif->tx_ring + block * block_size;
+ u32 tx_frame = apif->next_tx_frame;
+ struct tpacket2_hdr *tph;
+
+ s = format (s, "Linux PACKET socket interface\n");
+ s = format (s, "%Ublock:%d frame:%d\n", format_white_space, indent,
+ block_size, frame_size);
+ s = format (s, "%Unext frame:%d\n", format_white_space, indent,
+ apif->next_tx_frame);
+
+ int n_send_req = 0, n_avail = 0, n_sending = 0, n_tot = 0, n_wrong = 0;
+ do
+ {
+ tph = (struct tpacket2_hdr *) (block_start + tx_frame * frame_size);
+ tx_frame = (tx_frame + 1) % frame_num;
+ if (tph->tp_status == 0)
+ n_avail++;
+ else if (tph->tp_status & TP_STATUS_SEND_REQUEST)
+ n_send_req++;
+ else if (tph->tp_status & TP_STATUS_SENDING)
+ n_sending++;
+ else
+ n_wrong++;
+ n_tot++;
+ }
+ while (tx_frame != apif->next_tx_frame);
+ s = format (s, "%Uavailable:%d request:%d sending:%d wrong:%d total:%d\n",
+ format_white_space, indent, n_avail, n_send_req, n_sending,
+ n_wrong, n_tot);
+
+ clib_spinlock_unlock_if_init (&apif->lockp);
return s;
}
@@ -99,7 +140,7 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm,
struct tpacket2_hdr *tph;
u32 frame_not_ready = 0;
- while (n_left > 0)
+ while (n_left)
{
u32 len;
u32 offset = 0;
@@ -108,13 +149,17 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm,
u32 bi = buffers[0];
buffers++;
+ nextframe:
tph = (struct tpacket2_hdr *) (block_start + tx_frame * frame_size);
-
- if (PREDICT_FALSE
- (tph->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)))
+ if (PREDICT_FALSE (tph->tp_status &
+ (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)))
{
+ tx_frame = (tx_frame + 1) % frame_num;
frame_not_ready++;
- goto next;
+ /* check if we've exhausted the ring */
+ if (PREDICT_FALSE (frame_not_ready + n_sent == frame_num))
+ break;
+ goto nextframe;
}
do
@@ -132,7 +177,7 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm,
tph->tp_len = tph->tp_snaplen = offset;
tph->tp_status = TP_STATUS_SEND_REQUEST;
n_sent++;
- next:
+
tx_frame = (tx_frame + 1) % frame_num;
/* check if we've exhausted the ring */
@@ -142,23 +187,22 @@ VNET_DEVICE_CLASS_TX_FN (af_packet_device_class) (vlib_main_t * vm,
CLIB_MEMORY_BARRIER ();
- if (PREDICT_TRUE (n_sent))
- {
- apif->next_tx_frame = tx_frame;
+ apif->next_tx_frame = tx_frame;
- if (PREDICT_FALSE (sendto (apif->fd, NULL, 0,
- MSG_DONTWAIT, NULL, 0) == -1))
- {
- /* Uh-oh, drop & move on, but count whether it was fatal or not.
- * Note that we have no reliable way to properly determine the
- * disposition of the packets we just enqueued for delivery.
- */
- vlib_error_count (vm, node->node_index,
- unix_error_is_fatal (errno) ?
+ if (PREDICT_TRUE (n_sent))
+ if (PREDICT_FALSE (sendto (apif->fd, NULL, 0, MSG_DONTWAIT, NULL, 0) ==
+ -1))
+ {
+ /* Uh-oh, drop & move on, but count whether it was fatal or not.
+ * Note that we have no reliable way to properly determine the
+ * disposition of the packets we just enqueued for delivery.
+ */
+ vlib_error_count (vm, node->node_index,
+ unix_error_is_fatal (errno) ?
AF_PACKET_TX_ERROR_TXRING_FATAL :
- AF_PACKET_TX_ERROR_TXRING_EAGAIN, n_sent);
- }
- }
+ AF_PACKET_TX_ERROR_TXRING_EAGAIN,
+ n_sent);
+ }
clib_spinlock_unlock_if_init (&apif->lockp);
diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c
index d444b3b6eea..f4db6399a57 100644
--- a/src/vnet/devices/af_packet/node.c
+++ b/src/vnet/devices/af_packet/node.c
@@ -109,7 +109,15 @@ buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi, u32 prev_bi)
}
static_always_inline void
-mark_tcp_udp_cksum_calc (vlib_buffer_t * b)
+fill_gso_buffer_flags (vlib_buffer_t *b, u32 gso_size, u8 l4_hdr_sz)
+{
+ b->flags |= VNET_BUFFER_F_GSO;
+ vnet_buffer2 (b)->gso_size = gso_size;
+ vnet_buffer2 (b)->gso_l4_hdr_sz = l4_hdr_sz;
+}
+
+static_always_inline void
+mark_tcp_udp_cksum_calc (vlib_buffer_t *b, u8 *l4_hdr_sz)
{
ethernet_header_t *eth = vlib_buffer_get_current (b);
if (clib_net_to_host_u16 (eth->type) == ETHERNET_TYPE_IP4)
@@ -120,18 +128,20 @@ mark_tcp_udp_cksum_calc (vlib_buffer_t * b)
if (ip4->protocol == IP_PROTOCOL_TCP)
{
b->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
- ((tcp_header_t
- *) (vlib_buffer_get_current (b) +
- sizeof (ethernet_header_t) +
- ip4_header_bytes (ip4)))->checksum = 0;
+ tcp_header_t *tcp = (tcp_header_t *) (vlib_buffer_get_current (b) +
+ sizeof (ethernet_header_t) +
+ ip4_header_bytes (ip4));
+ tcp->checksum = 0;
+ *l4_hdr_sz = tcp_header_bytes (tcp);
}
else if (ip4->protocol == IP_PROTOCOL_UDP)
{
b->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
- ((udp_header_t
- *) (vlib_buffer_get_current (b) +
- sizeof (ethernet_header_t) +
- ip4_header_bytes (ip4)))->checksum = 0;
+ udp_header_t *udp = (udp_header_t *) (vlib_buffer_get_current (b) +
+ sizeof (ethernet_header_t) +
+ ip4_header_bytes (ip4));
+ udp->checksum = 0;
+ *l4_hdr_sz = sizeof (*udp);
}
vnet_buffer (b)->l3_hdr_offset = sizeof (ethernet_header_t);
vnet_buffer (b)->l4_hdr_offset =
@@ -156,16 +166,20 @@ mark_tcp_udp_cksum_calc (vlib_buffer_t * b)
if (ip6->protocol == IP_PROTOCOL_TCP)
{
b->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
- ((tcp_header_t
- *) (vlib_buffer_get_current (b) +
- sizeof (ethernet_header_t) + ip6_hdr_len))->checksum = 0;
+ tcp_header_t *tcp =
+ (tcp_header_t *) (vlib_buffer_get_current (b) +
+ sizeof (ethernet_header_t) + ip6_hdr_len);
+ tcp->checksum = 0;
+ *l4_hdr_sz = tcp_header_bytes (tcp);
}
else if (ip6->protocol == IP_PROTOCOL_UDP)
{
b->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
- ((udp_header_t
- *) (vlib_buffer_get_current (b) +
- sizeof (ethernet_header_t) + ip6_hdr_len))->checksum = 0;
+ udp_header_t *udp =
+ (udp_header_t *) (vlib_buffer_get_current (b) +
+ sizeof (ethernet_header_t) + ip6_hdr_len);
+ udp->checksum = 0;
+ *l4_hdr_sz = sizeof (*udp);
}
vnet_buffer (b)->l3_hdr_offset = sizeof (ethernet_header_t);
vnet_buffer (b)->l4_hdr_offset =
@@ -221,6 +235,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
u32 data_len = tph->tp_snaplen;
u32 offset = 0;
u32 bi0 = 0, first_bi0 = 0, prev_bi0;
+ u8 l4_hdr_sz = 0;
while (data_len)
{
@@ -275,7 +290,10 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
first_bi0 = bi0;
first_b0 = vlib_get_buffer (vm, first_bi0);
if (tph->tp_status & TP_STATUS_CSUMNOTREADY)
- mark_tcp_udp_cksum_calc (first_b0);
+ mark_tcp_udp_cksum_calc (first_b0, &l4_hdr_sz);
+ if (tph->tp_snaplen > apif->host_mtu)
+ fill_gso_buffer_flags (first_b0, apif->host_mtu,
+ l4_hdr_sz);
}
else
buffer_add_to_chain (vm, bi0, first_bi0, prev_bi0);