diff options
author | Steven Luong <sluong@cisco.com> | 2020-03-23 09:34:59 -0700 |
---|---|---|
committer | Steven Luong <sluong@cisco.com> | 2020-04-27 09:25:32 -0700 |
commit | bc0d9ff6727d77668e216aba1c6d6cb753fa2ac3 (patch) | |
tree | dc2af469cf255d3d819dd52f1cc6703d708f9728 | |
parent | ba6deb96e923f71aa9387c06000412c3fb1362fa (diff) |
virtio: support virtio 1.1 packed ring in vhost
virtio 1.1 defines a number of new features. Packed ring is among the most
notable and important ones. It combines used, available, and descriptor rings
into one.
This patch provides experimental support for packed ring. To avoid
regression, when packed ring is configured for the interface, it is branched
to a separate RX and TX driver. Non packed ring should continue to perform
as it was before.
Packed ring is tested using qemu4.2 and ubuntu focal fossa (kernel 5.4.0-12)
on the guest VM which supports packed ring.
To configure VPP with packed ring, just add the optional keyword "packed"
when creating the vhost interface. To bring up the guest VM with packed ring,
add "packed=on" in the qemu launch command.
To facilitate troubleshooting, also added "verbose" option in
show vhost desc CLI to include displaying the indirect descriptors.
Known qemu reconnect issue -
If VPP is restarted, guest VMs also need to be restarted. The problem
is that kernel virtio-net-pci keeps track of the previous available and used
indices. For virtio 1.0, these indices are in shared memory and qemu can
easily copy them to pass to the backend for reconnect. For virtio 1.1, these
indices are no longer in shared memory. Qemu needs a new mechanism to retrieve
them and it is not currently implemented. So when the protocol reconnects,
qemu does not have the correct available and used indices to pass to the
backend. As a result, after the reconnect, virtio-net-pci is reading the TX
ring from the wrong position in the ring, not the same position to which the
backend is writing. A similar problem also exists in the RX.
Type: feature
Signed-off-by: Steven Luong <sluong@cisco.com>
Change-Id: I5afc50b0bafab5a1de7a6dd10f399db3fafd144c
-rw-r--r-- | src/vat/api_format.c | 12 | ||||
-rw-r--r-- | src/vnet/devices/virtio/FEATURE.yaml | 2 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user.api | 6 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user.c | 311 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user.h | 63 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user_api.c | 20 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user_inline.h | 79 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user_input.c | 858 | ||||
-rw-r--r-- | src/vnet/devices/virtio/vhost_user_output.c | 421 | ||||
-rw-r--r-- | src/vpp/api/custom_dump.c | 10 | ||||
-rw-r--r-- | test/vpp_vhost_interface.py | 6 |
11 files changed, 1625 insertions, 163 deletions
diff --git a/src/vat/api_format.c b/src/vat/api_format.c index bea743f9283..96bb78fdb87 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -12447,6 +12447,7 @@ api_create_vhost_user_if (vat_main_t * vam) u8 disable_indirect_desc = 0; u8 *tag = 0; u8 enable_gso = 0; + u8 enable_packed = 0; int ret; /* Shut up coverity */ @@ -12470,6 +12471,8 @@ api_create_vhost_user_if (vat_main_t * vam) disable_indirect_desc = 1; else if (unformat (i, "gso")) enable_gso = 1; + else if (unformat (i, "packed")) + enable_packed = 1; else if (unformat (i, "tag %s", &tag)) ; else @@ -12495,6 +12498,7 @@ api_create_vhost_user_if (vat_main_t * vam) mp->disable_mrg_rxbuf = disable_mrg_rxbuf; mp->disable_indirect_desc = disable_indirect_desc; mp->enable_gso = enable_gso; + mp->enable_packed = enable_packed; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); vec_free (file_name); if (custom_dev_instance != ~0) @@ -12526,6 +12530,7 @@ api_modify_vhost_user_if (vat_main_t * vam) u8 sw_if_index_set = 0; u32 sw_if_index = (u32) ~ 0; u8 enable_gso = 0; + u8 enable_packed = 0; int ret; while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) @@ -12544,6 +12549,8 @@ api_modify_vhost_user_if (vat_main_t * vam) is_server = 1; else if (unformat (i, "gso")) enable_gso = 1; + else if (unformat (i, "packed")) + enable_packed = 1; else break; } @@ -12572,6 +12579,7 @@ api_modify_vhost_user_if (vat_main_t * vam) mp->sw_if_index = ntohl (sw_if_index); mp->is_server = is_server; mp->enable_gso = enable_gso; + mp->enable_packed = enable_packed; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); vec_free (file_name); if (custom_dev_instance != ~0) @@ -20782,10 +20790,10 @@ _(l2_interface_vlan_tag_rewrite, \ _(create_vhost_user_if, \ "socket <filename> [server] [renumber <dev_instance>] " \ "[disable_mrg_rxbuf] [disable_indirect_desc] [gso] " \ - "[mac <mac_address>]") \ + "[mac <mac_address>] [packed]") \ _(modify_vhost_user_if, \ "<intfc> | sw_if_index <nn> socket 
<filename>\n" \ - "[server] [renumber <dev_instance>] [gso]") \ + "[server] [renumber <dev_instance>] [gso] [packed]") \ _(delete_vhost_user_if, "<intfc> | sw_if_index <nn>") \ _(sw_interface_vhost_user_dump, "") \ _(show_version, "") \ diff --git a/src/vnet/devices/virtio/FEATURE.yaml b/src/vnet/devices/virtio/FEATURE.yaml index de449c866e5..b446a559403 100644 --- a/src/vnet/devices/virtio/FEATURE.yaml +++ b/src/vnet/devices/virtio/FEATURE.yaml @@ -7,7 +7,7 @@ features: - device mode to emulate vhost-user interface presented to VPP from the guest VM. - support multi-queue, GSO, checksum offload, indirect descriptor, - and jumbo frame. + jumbo frame, and packed ring. description: "Virtio v1.0 implementation" missing: - API dump filtering by sw_if_index diff --git a/src/vnet/devices/virtio/vhost_user.api b/src/vnet/devices/virtio/vhost_user.api index 9b057552c00..127b0a27fc7 100644 --- a/src/vnet/devices/virtio/vhost_user.api +++ b/src/vnet/devices/virtio/vhost_user.api @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -option version = "4.0.0"; +option version = "4.0.1"; import "vnet/interface_types.api"; import "vnet/ethernet/ethernet_types.api"; @@ -27,6 +27,7 @@ import "vnet/devices/virtio/virtio_types.api"; @param disable_mrg_rxbuf - disable the use of merge receive buffers @param disable_indirect_desc - disable the use of indirect descriptors which driver can use @param enable_gso - enable gso support (default 0) + @param enable_packed - enable packed ring support (default 0) @param mac_address - hardware address to use if 'use_custom_mac' is set */ define create_vhost_user_if @@ -39,6 +40,7 @@ define create_vhost_user_if bool disable_mrg_rxbuf; bool disable_indirect_desc; bool enable_gso; + bool enable_packed; u32 custom_dev_instance; bool use_custom_mac; vl_api_mac_address_t mac_address; @@ -62,6 +64,7 @@ define create_vhost_user_if_reply @param is_server - our side is socket server @param sock_filename - unix socket filename, used to speak with frontend @param enable_gso - enable gso support (default 0) + @param enable_packed - enable packed ring support (default 0) */ autoreply define modify_vhost_user_if { @@ -72,6 +75,7 @@ autoreply define modify_vhost_user_if string sock_filename[256]; bool renumber; bool enable_gso; + bool enable_packed; u32 custom_dev_instance; }; diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c index 7094a00fb33..d24e516a93c 100644 --- a/src/vnet/devices/virtio/vhost_user.c +++ b/src/vnet/devices/virtio/vhost_user.c @@ -466,6 +466,8 @@ vhost_user_socket_read (clib_file_t * uf) if (vui->enable_gso) msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + if (vui->enable_packed) + msg.u64 |= (1ULL << FEAT_VIRTIO_F_RING_PACKED); msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply " @@ -655,7 +657,11 @@ vhost_user_socket_read (clib_file_t * uf) vui->vrings[msg.state.index].used->idx; /* tell driver that we don't want interrupts */ - 
vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + if (vhost_user_is_packed_ring_supported (vui)) + vui->vrings[msg.state.index].used_event->flags = + VRING_EVENT_F_DISABLE; + else + vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; vlib_worker_thread_barrier_release (vm); vhost_user_update_iface_state (vui); break; @@ -762,10 +768,47 @@ vhost_user_socket_read (clib_file_t * uf) break; case VHOST_USER_SET_VRING_BASE: - vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); vlib_worker_thread_barrier_sync (vm); vui->vrings[msg.state.index].last_avail_idx = msg.state.num; + if (vhost_user_is_packed_ring_supported (vui)) + { + /* + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | last avail idx | | last used idx | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * ^ ^ + * | | + * avail wrap counter used wrap counter + */ + /* last avail idx at bit 0-14. */ + vui->vrings[msg.state.index].last_avail_idx = + msg.state.num & 0x7fff; + /* avail wrap counter at bit 15 */ + vui->vrings[msg.state.index].avail_wrap_counter = + ! !(msg.state.num & (1 << 15)); + + /* + * Although last_used_idx is passed in the upper 16 bits in qemu + * implementation, in practice, last_avail_idx and last_used_idx are + * usually the same. As a result, DPDK does not bother to pass us + * last_used_idx. The spec is not clear on thex coding. I figured it + * out by reading the qemu code. So let's just read last_avail_idx + * and set last_used_idx equals to last_avail_idx. 
+ */ + vui->vrings[msg.state.index].last_used_idx = + vui->vrings[msg.state.index].last_avail_idx; + vui->vrings[msg.state.index].used_wrap_counter = + vui->vrings[msg.state.index].avail_wrap_counter; + + if (vui->vrings[msg.state.index].avail_wrap_counter == 1) + vui->vrings[msg.state.index].avail_wrap_counter = + VIRTQ_DESC_F_AVAIL; + } vlib_worker_thread_barrier_release (vm); break; @@ -784,6 +827,15 @@ vhost_user_socket_read (clib_file_t * uf) * closing the vring also initializes the vring last_avail_idx */ msg.state.num = vui->vrings[msg.state.index].last_avail_idx; + if (vhost_user_is_packed_ring_supported (vui)) + { + msg.state.num = + (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) | + (! !vui->vrings[msg.state.index].avail_wrap_counter << 15); + msg.state.num |= + ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) | + (! !vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16; + } msg.flags |= 4; msg.size = sizeof (msg.state); @@ -793,7 +845,8 @@ vhost_user_socket_read (clib_file_t * uf) */ vhost_user_vring_close (vui, msg.state.index); vlib_worker_thread_barrier_release (vm); - vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); n = send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); @@ -1440,7 +1493,8 @@ vhost_user_vui_init (vnet_main_t * vnm, vhost_user_intf_t * vui, int server_sock_fd, const char *sock_filename, - u64 feature_mask, u32 * sw_if_index, u8 enable_gso) + u64 feature_mask, u32 * sw_if_index, u8 enable_gso, + u8 enable_packed) { vnet_sw_interface_t *sw; int q; @@ -1472,6 +1526,7 @@ vhost_user_vui_init (vnet_main_t * vnm, vui->log_base_addr = 0; vui->if_index = vui - vum->vhost_user_interfaces; vui->enable_gso = enable_gso; + vui->enable_packed = enable_packed; /* * enable_gso takes precedence over configurable feature mask if there * is a clash. 
@@ -1519,7 +1574,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, u32 * sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 enable_gso) + u8 enable_gso, u8 enable_packed) { vhost_user_intf_t *vui = NULL; u32 sw_if_idx = ~0; @@ -1560,7 +1615,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, vlib_worker_thread_barrier_release (vm); vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename, - feature_mask, &sw_if_idx, enable_gso); + feature_mask, &sw_if_idx, enable_gso, enable_packed); vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000); vhost_user_rx_thread_placement (vui, 1); @@ -1582,7 +1637,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, u8 is_server, u32 sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, - u8 enable_gso) + u8 enable_gso, u8 enable_packed) { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui = NULL; @@ -1619,7 +1674,8 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_term_if (vui); vhost_user_vui_init (vnm, vui, server_sock_fd, - sock_filename, feature_mask, &sw_if_idx, enable_gso); + sock_filename, feature_mask, &sw_if_idx, enable_gso, + enable_packed); if (renumber) vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); @@ -1645,7 +1701,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, u8 hwaddr[6]; u8 *hw = NULL; clib_error_t *error = NULL; - u8 enable_gso = 0; + u8 enable_gso = 0, enable_packed = 0; /* Get a line of input. 
*/ if (!unformat_user (input, unformat_line_input, line_input)) @@ -1653,6 +1709,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm, /* GSO feature is disable by default */ feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + /* packed-ring feature is disable by default */ + feature_mask &= ~(1ULL << FEAT_VIRTIO_F_RING_PACKED); while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "socket %s", &sock_filename)) @@ -1661,6 +1719,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm, is_server = 1; else if (unformat (line_input, "gso")) enable_gso = 1; + else if (unformat (line_input, "packed")) + enable_packed = 1; else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask)) ; else @@ -1685,7 +1745,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename, is_server, &sw_if_index, feature_mask, renumber, custom_dev_instance, hw, - enable_gso))) + enable_gso, enable_packed))) { error = clib_error_return (0, "vhost_user_create_if returned %d", rv); goto done; @@ -1799,6 +1859,186 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, return rv; } +static u8 * +format_vhost_user_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + vring_desc_t *desc_table = va_arg (*args, vring_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].next, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static u8 * +format_vhost_user_vring (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + int q = va_arg (*args, int); + + s = format (s, fmt, vui->vrings[q].avail->flags, vui->vrings[q].avail->idx, + 
vui->vrings[q].used->flags, vui->vrings[q].used->idx); + return s; +} + +static void +vhost_user_show_fds (vlib_main_t * vm, vhost_user_intf_t * vui, int q) +{ + int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx); + int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx); + + vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", kickfd, callfd, + vui->vrings[q].errfd); +} + +static void +vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_desc_t *desc_table; + + if (vui->vrings[q].avail && vui->vrings[q].used) + vlib_cli_output (vm, "%U", format_vhost_user_vring, + " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", + vui, q); + + vhost_user_show_fds (vm, vui, q); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags next " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) + { + desc_table = vui->vrings[q].desc; + vlib_cli_output (vm, "%U", format_vhost_user_desc, + " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len / sizeof (vring_desc_t); + desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + +static u8 * +format_vhost_user_packed_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + 
vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].id, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static u8 * +format_vhost_user_vring_packed (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + int q = va_arg (*args, int); + + s = format (s, fmt, vui->vrings[q].avail_event->flags, + vui->vrings[q].avail_event->off_wrap, + vui->vrings[q].used_event->flags, + vui->vrings[q].used_event->off_wrap, + vui->vrings[q].avail_wrap_counter, + vui->vrings[q].used_wrap_counter); + return s; +} + +static void +vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_packed_desc_t *desc_table; + + if (vui->vrings[q].avail_event && vui->vrings[q].used_event) + vlib_cli_output (vm, "%U", format_vhost_user_vring_packed, + " avail_event.flags %x avail_event.off_wrap %u " + "used_event.flags %x used_event.off_wrap %u\n" + " avail wrap counter %u, used wrap counter %u\n", + vui, q); + + vhost_user_show_fds (vm, vui, q); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags id " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) + { + desc_table = vui->vrings[q].packed_desc; + vlib_cli_output (vm, "%U", format_vhost_user_packed_desc, + " %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len >> 4; + desc_table = 
map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_packed_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + clib_error_t * show_vhost_user_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -1814,6 +2054,7 @@ show_vhost_user_command_fn (vlib_main_t * vm, u32 ci; int i, j, q; int show_descr = 0; + int show_verbose = 0; struct feat_struct { u8 bit; @@ -1855,6 +2096,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, } else if (unformat (input, "descriptors") || unformat (input, "desc")) show_descr = 1; + else if (unformat (input, "verbose")) + show_verbose = 1; else { error = clib_error_return (0, "unknown input `%U'", @@ -1884,6 +2127,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, hw_if_indices[i]); if (vui->enable_gso) vlib_cli_output (vm, " GSO enable"); + if (vui->enable_packed) + vlib_cli_output (vm, " Packed ring enable"); vlib_cli_output (vm, "virtio_net_hdr_sz %d\n" " features mask (0x%llx): \n" @@ -1985,41 +2230,11 @@ show_vhost_user_command_fn (vlib_main_t * vm, vui->vrings[q].last_avail_idx, vui->vrings[q].last_used_idx); - if (vui->vrings[q].avail && vui->vrings[q].used) - vlib_cli_output (vm, - " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", - vui->vrings[q].avail->flags, - vui->vrings[q].avail->idx, - vui->vrings[q].used->flags, - vui->vrings[q].used->idx); - - int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx); - int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx); - vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", - kickfd, callfd, vui->vrings[q].errfd); - - if (show_descr) - { - vlib_cli_output (vm, "\n descriptor table:\n"); - vlib_cli_output (vm, - " id addr len flags next user_addr\n"); - vlib_cli_output (vm, - " ===== 
================== ===== ====== ===== ==================\n"); - for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) - { - u32 mem_hint = 0; - vlib_cli_output (vm, - " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", - j, vui->vrings[q].desc[j].addr, - vui->vrings[q].desc[j].len, - vui->vrings[q].desc[j].flags, - vui->vrings[q].desc[j].next, - pointer_to_uword (map_guest_mem - (vui, - vui->vrings[q].desc[j]. - addr, &mem_hint))); - } - } + if (vhost_user_is_packed_ring_supported (vui)) + vhost_user_show_desc_packed (vm, vui, q, show_descr, + show_verbose); + else + vhost_user_show_desc (vm, vui, q, show_descr, show_verbose); } vlib_cli_output (vm, "\n"); } @@ -2090,7 +2305,8 @@ done: VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .path = "create vhost-user", .short_help = "create vhost-user socket <socket-filename> [server] " - "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso]", + "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso] " + "[packed]", .function = vhost_user_connect_command_fn, .is_mp_safe = 1, }; @@ -2251,7 +2467,8 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { /* *INDENT-OFF* */ VLIB_CLI_COMMAND (show_vhost_user_command, static) = { .path = "show vhost-user", - .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]", + .short_help = "show vhost-user [<interface> [<interface> [..]]] " + "[[descriptors] [verbose]]", .function = show_vhost_user_command_fn, }; /* *INDENT-ON* */ diff --git a/src/vnet/devices/virtio/vhost_user.h b/src/vnet/devices/virtio/vhost_user.h index f14f26a71e4..b86f42e70e8 100644 --- a/src/vnet/devices/virtio/vhost_user.h +++ b/src/vnet/devices/virtio/vhost_user.h @@ -25,8 +25,15 @@ #define VHOST_USER_VRING_NOFD_MASK 0x100 #define VIRTQ_DESC_F_NEXT 1 +#define VIRTQ_DESC_F_WRITE 2 #define VIRTQ_DESC_F_INDIRECT 4 -#define VHOST_USER_REPLY_MASK (0x1 << 2) + +#define VIRTQ_DESC_F_AVAIL (1 << 7) +#define VIRTQ_DESC_F_USED (1 << 15) + +#define 
VRING_EVENT_F_ENABLE 0x0 +#define VRING_EVENT_F_DISABLE 0x1 +#define VRING_EVENT_F_DESC 0x2 #define VHOST_USER_PROTOCOL_F_MQ 0 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 @@ -100,8 +107,11 @@ typedef enum _ (VHOST_F_LOG_ALL, 26) \ _ (VIRTIO_F_ANY_LAYOUT, 27) \ _ (VIRTIO_F_INDIRECT_DESC, 28) \ + _ (VIRTIO_F_EVENT_IDX, 29) \ _ (VHOST_USER_F_PROTOCOL_FEATURES, 30) \ - _ (VIRTIO_F_VERSION_1, 32) + _ (VIRTIO_F_VERSION_1, 32) \ + _ (VIRTIO_F_RING_PACKED, 34) \ + _ (VIRTIO_F_IN_ORDER, 35) typedef enum { @@ -130,12 +140,12 @@ int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 * sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 enable_gso); + u8 enable_gso, u8 enable_packed); int vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, - u8 enable_gso); + u8 enable_gso, u8 enable_packed); int vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index); @@ -223,6 +233,22 @@ typedef struct } ring[VHOST_VRING_MAX_SIZE]; } __attribute ((packed)) vring_used_t; +typedef CLIB_PACKED (struct +{ + u64 addr; // packet data buffer address + u32 len; // packet data buffer size + u16 id; // buffer id + u16 flags; // flags +}) vring_packed_desc_t; + +STATIC_ASSERT_SIZEOF (vring_packed_desc_t, 16); + +typedef CLIB_PACKED (struct +{ + u16 off_wrap; + u16 flags; +}) vring_desc_event_t; + typedef struct { u8 flags; @@ -260,9 +286,21 @@ typedef struct u16 last_avail_idx; u16 last_used_idx; u16 n_since_last_int; - vring_desc_t *desc; - vring_avail_t *avail; - vring_used_t *used; + union + { + vring_desc_t *desc; + vring_packed_desc_t *packed_desc; + }; + union + { + vring_avail_t *avail; + vring_desc_event_t *avail_event; + }; + union + { + vring_used_t *used; + vring_desc_event_t *used_event; + }; uword desc_user_addr; uword used_user_addr; uword avail_user_addr; @@ -287,6 
+325,9 @@ typedef struct * the interface even if it is disconnected and reconnected. */ i16 qid; + + u16 used_wrap_counter; + u16 avail_wrap_counter; } vhost_user_vring_t; #define VHOST_USER_EVENT_START_TIMER 1 @@ -332,6 +373,10 @@ typedef struct u16 *per_cpu_tx_qid; u8 enable_gso; + + /* Packed ring configured */ + u8 enable_packed; + } vhost_user_intf_t; typedef struct @@ -350,7 +395,6 @@ typedef struct virtio_net_hdr_mrg_rxbuf_t hdr; /** Virtio header **/ } vhost_trace_t; - #define VHOST_USER_RX_BUFFERS_N (2 * VLIB_FRAME_SIZE + 2) #define VHOST_USER_COPY_ARRAY_N (4 * VLIB_FRAME_SIZE) @@ -365,6 +409,9 @@ typedef struct /* This is here so it doesn't end-up * using stack or registers. */ vhost_trace_t *current_trace; + + u32 *to_next_list; + vlib_buffer_t **rx_buffers_pdesc; } vhost_cpu_t; typedef struct diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c index 2ab87a65690..67365334d95 100644 --- a/src/vnet/devices/virtio/vhost_user_api.c +++ b/src/vnet/devices/virtio/vhost_user_api.c @@ -71,10 +71,12 @@ vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp) disabled_features |= (1ULL << FEAT_VIRTIO_F_INDIRECT_DESC); /* - * feature mask is not supported via binary API. We disable GSO feature in the - * feature mask. It may be enabled via enable_gso argument. + * GSO and PACKED are not supported by feature mask via binary API. We + * disable GSO and PACKED feature in the feature mask. 
They may be enabled + * explicitly via enable_gso and enable_packed argument */ - disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS | + (1ULL << FEAT_VIRTIO_F_RING_PACKED); features &= ~disabled_features; if (mp->use_custom_mac) @@ -86,7 +88,7 @@ vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp) rv = vhost_user_create_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, &sw_if_index, features, mp->renumber, ntohl (mp->custom_dev_instance), - mac_p, mp->enable_gso); + mac_p, mp->enable_gso, mp->enable_packed); /* Remember an interface tag for the new interface */ if (rv == 0) @@ -122,16 +124,18 @@ vl_api_modify_vhost_user_if_t_handler (vl_api_modify_vhost_user_if_t * mp) vlib_main_t *vm = vlib_get_main (); /* - * feature mask is not supported via binary API. We disable GSO feature in the - * feature mask. It may be enabled via enable_gso argument. + * GSO and PACKED are not supported by feature mask via binary API. We + * disable GSO and PACKED feature in the feature mask. 
They may be enabled + * explicitly via enable_gso and enable_packed argument */ - disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS | + (1ULL << FEAT_VIRTIO_F_RING_PACKED); features &= ~disabled_features; rv = vhost_user_modify_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, sw_if_index, features, mp->renumber, ntohl (mp->custom_dev_instance), - mp->enable_gso); + mp->enable_gso, mp->enable_packed); REPLY_MACRO (VL_API_MODIFY_VHOST_USER_IF_REPLY); } diff --git a/src/vnet/devices/virtio/vhost_user_inline.h b/src/vnet/devices/virtio/vhost_user_inline.h index e4a1d596040..ceaf78cf799 100644 --- a/src/vnet/devices/virtio/vhost_user_inline.h +++ b/src/vnet/devices/virtio/vhost_user_inline.h @@ -292,6 +292,85 @@ vhost_user_update_gso_interface_count (vhost_user_intf_t * vui, u8 add) } } } + +static_always_inline u8 +vhost_user_packed_desc_available (vhost_user_vring_t * vring, u16 idx) +{ + return (((vring->packed_desc[idx].flags & VIRTQ_DESC_F_AVAIL) == + vring->avail_wrap_counter)); +} + +static_always_inline void +vhost_user_advance_last_avail_idx (vhost_user_vring_t * vring) +{ + vring->last_avail_idx++; + if (PREDICT_FALSE ((vring->last_avail_idx & vring->qsz_mask) == 0)) + vring->avail_wrap_counter ^= VIRTQ_DESC_F_AVAIL; +} + +static_always_inline void +vhost_user_advance_last_avail_table_idx (vhost_user_intf_t * vui, + vhost_user_vring_t * vring, + u8 chained) +{ + if (chained) + { + vring_packed_desc_t *desc_table = vring->packed_desc; + + /* pick up the slot of the next avail idx */ + while (desc_table[vring->last_avail_idx & vring->qsz_mask].flags & + VIRTQ_DESC_F_NEXT) + vhost_user_advance_last_avail_idx (vring); + } + + vhost_user_advance_last_avail_idx (vring); +} + +static_always_inline void +vhost_user_undo_advanced_last_avail_idx (vhost_user_vring_t * vring) +{ + if (PREDICT_FALSE ((vring->last_avail_idx & vring->qsz_mask) == 0)) + 
vring->avail_wrap_counter ^= VIRTQ_DESC_F_AVAIL; + vring->last_avail_idx--; +} + +static_always_inline void +vhost_user_dequeue_descs (vhost_user_vring_t * rxvq, + virtio_net_hdr_mrg_rxbuf_t * hdr, + u16 * n_descs_processed) +{ + u16 i; + + *n_descs_processed -= (hdr->num_buffers - 1); + for (i = 0; i < hdr->num_buffers - 1; i++) + vhost_user_undo_advanced_last_avail_idx (rxvq); +} + +static_always_inline void +vhost_user_dequeue_chained_descs (vhost_user_vring_t * rxvq, + u16 * n_descs_processed) +{ + while (*n_descs_processed) + { + vhost_user_undo_advanced_last_avail_idx (rxvq); + (*n_descs_processed)--; + } +} + +static_always_inline void +vhost_user_advance_last_used_idx (vhost_user_vring_t * vring) +{ + vring->last_used_idx++; + if (PREDICT_FALSE ((vring->last_used_idx & vring->qsz_mask) == 0)) + vring->used_wrap_counter ^= 1; +} + +static_always_inline u64 +vhost_user_is_packed_ring_supported (vhost_user_intf_t * vui) +{ + return (vui->features & (1ULL << FEAT_VIRTIO_F_RING_PACKED)); +} + #endif /* diff --git a/src/vnet/devices/virtio/vhost_user_input.c b/src/vnet/devices/virtio/vhost_user_input.c index 4b52bd5a54b..dd899094225 100644 --- a/src/vnet/devices/virtio/vhost_user_input.c +++ b/src/vnet/devices/virtio/vhost_user_input.c @@ -74,6 +74,7 @@ extern vlib_node_registration_t vhost_user_input_node; _(MMAP_FAIL, "mmap failure") \ _(INDIRECT_OVERFLOW, "indirect descriptor overflows table") \ _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)") \ + _(NOT_READY, "vhost interface not ready or down") \ _(FULL_RX_QUEUE, "full rx queue (possible driver tx drop)") typedef enum @@ -249,64 +250,59 @@ vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data, virtio_net_hdr_t * hdr) { u8 l4_hdr_sz = 0; + u8 l4_proto = 0; + ethernet_header_t *eh = (ethernet_header_t *) b0_data; + u16 ethertype = clib_net_to_host_u16 (eh->type); + u16 l2hdr_sz = sizeof (ethernet_header_t); - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + if 
(ethernet_frame_is_tagged (ethertype)) { - u8 l4_proto = 0; - ethernet_header_t *eh = (ethernet_header_t *) b0_data; - u16 ethertype = clib_net_to_host_u16 (eh->type); - u16 l2hdr_sz = sizeof (ethernet_header_t); + ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1); - if (ethernet_frame_is_tagged (ethertype)) + ethertype = clib_net_to_host_u16 (vlan->type); + l2hdr_sz += sizeof (*vlan); + if (ethertype == ETHERNET_TYPE_VLAN) { - ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1); - + vlan++; ethertype = clib_net_to_host_u16 (vlan->type); l2hdr_sz += sizeof (*vlan); - if (ethertype == ETHERNET_TYPE_VLAN) - { - vlan++; - ethertype = clib_net_to_host_u16 (vlan->type); - l2hdr_sz += sizeof (*vlan); - } - } - vnet_buffer (b0)->l2_hdr_offset = 0; - vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz; - vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start; - b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID | - VNET_BUFFER_F_L3_HDR_OFFSET_VALID | - VNET_BUFFER_F_L4_HDR_OFFSET_VALID); - - if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4)) - { - ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz); - l4_proto = ip4->protocol; - b0->flags |= (VNET_BUFFER_F_IS_IP4 | - VNET_BUFFER_F_OFFLOAD_IP_CKSUM); - } - else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6)) - { - ip6_header_t *ip6 = (ip6_header_t *) (b0_data + l2hdr_sz); - l4_proto = ip6->protocol; - b0->flags |= VNET_BUFFER_F_IS_IP6; } + } + vnet_buffer (b0)->l2_hdr_offset = 0; + vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz; + vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start; + b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID | + VNET_BUFFER_F_L3_HDR_OFFSET_VALID | + VNET_BUFFER_F_L4_HDR_OFFSET_VALID); + + if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4)) + { + ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip4->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM; + } + else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6)) + { + ip6_header_t *ip6 
= (ip6_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip6->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP6; + } - if (l4_proto == IP_PROTOCOL_TCP) - { - tcp_header_t *tcp = (tcp_header_t *) - (b0_data + vnet_buffer (b0)->l4_hdr_offset); - l4_hdr_sz = tcp_header_bytes (tcp); - tcp->checksum = 0; - b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - } - else if (l4_proto == IP_PROTOCOL_UDP) - { - udp_header_t *udp = - (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset); - l4_hdr_sz = sizeof (*udp); - udp->checksum = 0; - b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; - } + if (l4_proto == IP_PROTOCOL_TCP) + { + tcp_header_t *tcp = (tcp_header_t *) + (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = tcp_header_bytes (tcp); + tcp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + } + else if (l4_proto == IP_PROTOCOL_UDP) + { + udp_header_t *udp = + (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = sizeof (*udp); + udp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; } if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP) @@ -329,6 +325,57 @@ vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data, } } +static_always_inline void +vhost_user_input_do_interrupt (vlib_main_t * vm, vhost_user_vring_t * txvq, + vhost_user_vring_t * rxvq) +{ + f64 now = vlib_time_now (vm); + + if ((txvq->n_since_last_int) && (txvq->int_deadline < now)) + vhost_user_send_call (vm, txvq); + + if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now)) + vhost_user_send_call (vm, rxvq); +} + +static_always_inline void +vhost_user_input_setup_frame (vlib_main_t * vm, vlib_node_runtime_t * node, + vhost_user_intf_t * vui, + u32 * current_config_index, u32 * next_index, + u32 ** to_next, u32 * n_left_to_next) +{ + vnet_feature_main_t *fm = &feature_main; + u8 feature_arc_idx = fm->device_input_feature_arc_index; + + if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index))) + { + vnet_feature_config_main_t *cm; + cm = 
&fm->feature_config_mains[feature_arc_idx]; + *current_config_index = vec_elt (cm->config_index_by_sw_if_index, + vui->sw_if_index); + vnet_get_config_data (&cm->config_main, current_config_index, + next_index, 0); + } + + vlib_get_new_next_frame (vm, node, *next_index, *to_next, *n_left_to_next); + + if (*next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT) + { + /* give some hints to ethernet-input */ + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + nf = vlib_node_runtime_get_next_frame (vm, node, *next_index); + f = vlib_get_frame (vm, nf->frame); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = vui->sw_if_index; + ef->hw_if_index = vui->hw_if_index; + vlib_frame_no_append (f); + } +} + static_always_inline u32 vhost_user_if_input (vlib_main_t * vm, vhost_user_main_t * vum, @@ -359,13 +406,7 @@ vhost_user_if_input (vlib_main_t * vm, { /* do we have pending interrupts ? */ vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)]; - f64 now = vlib_time_now (vm); - - if ((txvq->n_since_last_int) && (txvq->int_deadline < now)) - vhost_user_send_call (vm, txvq); - - if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now)) - vhost_user_send_call (vm, rxvq); + vhost_user_input_do_interrupt (vm, txvq, rxvq); } /* @@ -461,37 +502,12 @@ vhost_user_if_input (vlib_main_t * vm, } } - if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index))) - { - vnet_feature_config_main_t *cm; - cm = &fm->feature_config_mains[feature_arc_idx]; - current_config_index = vec_elt (cm->config_index_by_sw_if_index, - vui->sw_if_index); - vnet_get_config_data (&cm->config_main, &current_config_index, - &next_index, 0); - } + vhost_user_input_setup_frame (vm, node, vui, &current_config_index, + &next_index, &to_next, &n_left_to_next); u16 last_avail_idx = txvq->last_avail_idx; u16 last_used_idx = txvq->last_used_idx; - vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); - - if 
(next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT) - { - /* give some hints to ethernet-input */ - vlib_next_frame_t *nf; - vlib_frame_t *f; - ethernet_input_frame_t *ef; - nf = vlib_node_runtime_get_next_frame (vm, node, next_index); - f = vlib_get_frame (vm, nf->frame); - f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; - - ef = vlib_frame_scalar_args (f); - ef->sw_if_index = vui->sw_if_index; - ef->hw_if_index = vui->hw_if_index; - vlib_frame_no_append (f); - } - while (n_left > 0) { vlib_buffer_t *b_head, *b_current; @@ -747,6 +763,654 @@ done: return n_rx_packets; } +static_always_inline void +vhost_user_mark_desc_consumed (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, u16 desc_head, + u16 n_descs_processed) +{ + vring_packed_desc_t *desc_table = txvq->packed_desc; + u16 desc_idx; + u16 mask = txvq->qsz_mask; + + for (desc_idx = 0; desc_idx < n_descs_processed; desc_idx++) + { + if (txvq->used_wrap_counter) + desc_table[(desc_head + desc_idx) & mask].flags |= + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + desc_table[(desc_head + desc_idx) & mask].flags &= + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + vhost_user_advance_last_used_idx (txvq); + } +} + +static_always_inline void +vhost_user_rx_trace_packed (vhost_trace_t * t, vhost_user_intf_t * vui, + u16 qid, vhost_user_vring_t * txvq, + u16 desc_current) +{ + vhost_user_main_t *vum = &vhost_user_main; + vring_packed_desc_t *hdr_desc; + virtio_net_hdr_mrg_rxbuf_t *hdr; + u32 hint = 0; + + clib_memset (t, 0, sizeof (*t)); + t->device_index = vui - vum->vhost_user_interfaces; + t->qid = qid; + + hdr_desc = &txvq->packed_desc[desc_current]; + if (txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT; + /* Header is the first here */ + hdr_desc = map_guest_mem (vui, txvq->packed_desc[desc_current].addr, + &hint); + } + if (txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) + t->virtio_ring_flags |= 1 << 
VIRTIO_TRACE_F_SIMPLE_CHAINED; + + if (!(txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) && + !(txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)) + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC; + + t->first_desc_len = hdr_desc ? hdr_desc->len : 0; + + if (!hdr_desc || !(hdr = map_guest_mem (vui, hdr_desc->addr, &hint))) + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_MAP_ERROR; + else + { + u32 len = vui->virtio_net_hdr_sz; + clib_memcpy_fast (&t->hdr, hdr, + len > hdr_desc->len ? hdr_desc->len : len); + } +} + +static_always_inline u32 +vhost_user_rx_discard_packet_packed (vlib_main_t * vm, + vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 discard_max) +{ + u32 discarded_packets = 0; + u16 mask = txvq->qsz_mask; + u16 desc_current, desc_head; + + desc_head = desc_current = txvq->last_used_idx & mask; + + /* + * On the RX side, each packet corresponds to one descriptor + * (it is the same whether it is a shallow descriptor, chained, or indirect). + * Therefore, discarding a packet is like discarding a descriptor. 
+ */ + while ((discarded_packets != discard_max) && + vhost_user_packed_desc_available (txvq, desc_current)) + { + vhost_user_advance_last_avail_idx (txvq); + discarded_packets++; + desc_current = (desc_current + 1) & mask; + } + + if (PREDICT_TRUE (discarded_packets)) + vhost_user_mark_desc_consumed (vui, txvq, desc_head, discarded_packets); + return (discarded_packets); +} + +static_always_inline u32 +vhost_user_input_copy_packed (vhost_user_intf_t * vui, vhost_copy_t * cpy, + u16 copy_len, u32 * map_hint) +{ + void *src0, *src1, *src2, *src3, *src4, *src5, *src6, *src7; + u8 bad; + u32 rc = VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR; + + if (PREDICT_TRUE (copy_len >= 8)) + { + src4 = map_guest_mem (vui, cpy[0].src, map_hint); + src5 = map_guest_mem (vui, cpy[1].src, map_hint); + src6 = map_guest_mem (vui, cpy[2].src, map_hint); + src7 = map_guest_mem (vui, cpy[3].src, map_hint); + bad = (src4 == 0) + (src5 == 0) + (src6 == 0) + (src7 == 0); + if (PREDICT_FALSE (bad)) + goto one_by_one; + CLIB_PREFETCH (src4, 64, LOAD); + CLIB_PREFETCH (src5, 64, LOAD); + CLIB_PREFETCH (src6, 64, LOAD); + CLIB_PREFETCH (src7, 64, LOAD); + + while (PREDICT_TRUE (copy_len >= 8)) + { + src0 = src4; + src1 = src5; + src2 = src6; + src3 = src7; + + src4 = map_guest_mem (vui, cpy[4].src, map_hint); + src5 = map_guest_mem (vui, cpy[5].src, map_hint); + src6 = map_guest_mem (vui, cpy[6].src, map_hint); + src7 = map_guest_mem (vui, cpy[7].src, map_hint); + bad = (src4 == 0) + (src5 == 0) + (src6 == 0) + (src7 == 0); + if (PREDICT_FALSE (bad)) + break; + + CLIB_PREFETCH (src4, 64, LOAD); + CLIB_PREFETCH (src5, 64, LOAD); + CLIB_PREFETCH (src6, 64, LOAD); + CLIB_PREFETCH (src7, 64, LOAD); + + clib_memcpy_fast ((void *) cpy[0].dst, src0, cpy[0].len); + clib_memcpy_fast ((void *) cpy[1].dst, src1, cpy[1].len); + clib_memcpy_fast ((void *) cpy[2].dst, src2, cpy[2].len); + clib_memcpy_fast ((void *) cpy[3].dst, src3, cpy[3].len); + copy_len -= 4; + cpy += 4; + } + } + +one_by_one: + while (copy_len) 
+ { + if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint)))) + { + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + break; + } + clib_memcpy_fast ((void *) cpy->dst, src0, cpy->len); + copy_len -= 1; + cpy += 1; + } + return rc; +} + +static_always_inline u32 +vhost_user_do_offload (vhost_user_intf_t * vui, + vring_packed_desc_t * desc_table, u16 desc_current, + u16 mask, vlib_buffer_t * b_head, u32 * map_hint) +{ + u32 rc = VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR; + virtio_net_hdr_mrg_rxbuf_t *hdr; + u8 *b_data; + u32 desc_data_offset = vui->virtio_net_hdr_sz; + + hdr = map_guest_mem (vui, desc_table[desc_current].addr, map_hint); + if (PREDICT_FALSE (hdr == 0)) + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + { + if (desc_data_offset == desc_table[desc_current].len) + { + desc_current = (desc_current + 1) & mask; + b_data = + map_guest_mem (vui, desc_table[desc_current].addr, map_hint); + if (PREDICT_FALSE (b_data == 0)) + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + else + vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr); + } + else + { + b_data = (u8 *) hdr + desc_data_offset; + vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr); + } + } + + return rc; +} + +static_always_inline u32 +vhost_user_compute_buffers_required (u32 desc_len, u32 buffer_data_size) +{ + div_t result; + u32 buffers_required; + + if (PREDICT_TRUE (buffer_data_size == 2048)) + { + buffers_required = desc_len >> 11; + if ((desc_len & 2047) != 0) + buffers_required++; + return (buffers_required); + } + + if (desc_len < buffer_data_size) + return 1; + + result = div (desc_len, buffer_data_size); + if (result.rem) + buffers_required = result.quot + 1; + else + buffers_required = result.quot; + + return (buffers_required); +} + +static_always_inline u32 +vhost_user_compute_indirect_desc_len (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 buffer_data_size, u16 desc_current, + u32 * map_hint) +{ + 
vring_packed_desc_t *desc_table = txvq->packed_desc; + u32 desc_len = 0; + u16 desc_data_offset = vui->virtio_net_hdr_sz; + u16 desc_idx = desc_current; + u32 n_descs; + + n_descs = desc_table[desc_idx].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_idx].addr, map_hint); + if (PREDICT_FALSE (desc_table == 0)) + return 0; + + for (desc_idx = 0; desc_idx < n_descs; desc_idx++) + desc_len += desc_table[desc_idx].len; + + if (PREDICT_TRUE (desc_len > desc_data_offset)) + desc_len -= desc_data_offset; + + return vhost_user_compute_buffers_required (desc_len, buffer_data_size); +} + +static_always_inline u32 +vhost_user_compute_chained_desc_len (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 buffer_data_size, u16 * current, + u16 * n_left) +{ + vring_packed_desc_t *desc_table = txvq->packed_desc; + u32 desc_len = 0; + u16 mask = txvq->qsz_mask; + + while (desc_table[*current].flags & VIRTQ_DESC_F_NEXT) + { + desc_len += desc_table[*current].len; + (*n_left)++; + *current = (*current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + } + desc_len += desc_table[*current].len; + (*n_left)++; + *current = (*current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + + if (PREDICT_TRUE (desc_len > vui->virtio_net_hdr_sz)) + desc_len -= vui->virtio_net_hdr_sz; + + return vhost_user_compute_buffers_required (desc_len, buffer_data_size); +} + +static_always_inline void +vhost_user_assemble_packet (vring_packed_desc_t * desc_table, + u16 * desc_idx, vlib_buffer_t * b_head, + vlib_buffer_t ** b_current, u32 ** next, + vlib_buffer_t *** b, u32 * bi_current, + vhost_cpu_t * cpu, u16 * copy_len, + u32 * buffers_used, u32 buffers_required, + u32 * desc_data_offset, u32 buffer_data_size, + u16 mask) +{ + u32 desc_data_l; + + while (*desc_data_offset < desc_table[*desc_idx].len) + { + /* Get more output if necessary. Or end of packet. 
*/ + if (PREDICT_FALSE ((*b_current)->current_length == buffer_data_size)) + { + /* Get next output */ + u32 bi_next = **next; + (*next)++; + (*b_current)->next_buffer = bi_next; + (*b_current)->flags |= VLIB_BUFFER_NEXT_PRESENT; + *bi_current = bi_next; + *b_current = **b; + (*b)++; + (*buffers_used)++; + ASSERT (*buffers_used <= buffers_required); + } + + /* Prepare a copy order executed later for the data */ + ASSERT (*copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[*copy_len]; + (*copy_len)++; + desc_data_l = desc_table[*desc_idx].len - *desc_data_offset; + cpy->len = buffer_data_size - (*b_current)->current_length; + cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len; + cpy->dst = (uword) (vlib_buffer_get_current (*b_current) + + (*b_current)->current_length); + cpy->src = desc_table[*desc_idx].addr + *desc_data_offset; + + *desc_data_offset += cpy->len; + + (*b_current)->current_length += cpy->len; + b_head->total_length_not_including_first_buffer += cpy->len; + } + *desc_idx = (*desc_idx + 1) & mask;; + *desc_data_offset = 0; +} + +static_always_inline u32 +vhost_user_if_input_packed (vlib_main_t * vm, vhost_user_main_t * vum, + vhost_user_intf_t * vui, u16 qid, + vlib_node_runtime_t * node, + vnet_hw_interface_rx_mode mode, u8 enable_csum) +{ + vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + vnet_feature_main_t *fm = &feature_main; + u8 feature_arc_idx = fm->device_input_feature_arc_index; + u16 n_rx_packets = 0; + u32 n_rx_bytes = 0; + u16 n_left = 0; + u32 buffers_required = 0; + u32 n_left_to_next, *to_next; + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + u32 n_trace = vlib_get_trace_count (vm, node); + u32 buffer_data_size = vlib_buffer_get_default_data_size (vm); + u32 map_hint = 0; + vhost_cpu_t *cpu = &vum->cpus[vm->thread_index]; + u16 copy_len = 0; + u32 current_config_index = ~0; + u16 mask = txvq->qsz_mask; + u16 desc_current, desc_head, last_used_idx; + vring_packed_desc_t 
*desc_table = 0; + u32 n_descs_processed = 0; + u32 rv; + vlib_buffer_t **b; + u32 *next; + u32 buffers_used = 0; + u16 current, n_descs_to_process; + + /* The descriptor table is not ready yet */ + if (PREDICT_FALSE (txvq->packed_desc == 0)) + goto done; + + /* do we have pending interrupts ? */ + vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)]; + vhost_user_input_do_interrupt (vm, txvq, rxvq); + + /* + * For adaptive mode, it is optimized to reduce interrupts. + * If the scheduler switches the input node to polling due + * to burst of traffic, we tell the driver no interrupt. + * When the traffic subsides, the scheduler switches the node back to + * interrupt mode. We must tell the driver we want interrupt. + */ + if (PREDICT_FALSE (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) + { + if ((node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) || + !(node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + /* Tell driver we want notification */ + txvq->used_event->flags = 0; + else + /* Tell driver we don't want notification */ + txvq->used_event->flags = VRING_EVENT_F_DISABLE; + } + + last_used_idx = txvq->last_used_idx & mask; + desc_head = desc_current = last_used_idx; + + if (vhost_user_packed_desc_available (txvq, desc_current) == 0) + goto done; + + if (PREDICT_FALSE (!vui->admin_up || !vui->is_ready || !(txvq->enabled))) + { + /* + * Discard input packet if interface is admin down or vring is not + * enabled. + * "For example, for a networking device, in the disabled state + * client must not supply any new RX packets, but must process + * and discard any TX packets." 
+ */ + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, + VHOST_USER_DOWN_DISCARD_COUNT); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NOT_READY, rv); + goto done; + } + + vhost_user_input_setup_frame (vm, node, vui, &current_config_index, + &next_index, &to_next, &n_left_to_next); + + /* + * Compute n_left and total buffers needed + */ + desc_table = txvq->packed_desc; + current = desc_current; + while (vhost_user_packed_desc_available (txvq, current) && + (n_left < VLIB_FRAME_SIZE)) + { + if (desc_table[current].flags & VIRTQ_DESC_F_INDIRECT) + { + buffers_required += + vhost_user_compute_indirect_desc_len (vui, txvq, buffer_data_size, + current, &map_hint); + n_left++; + current = (current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + } + else + { + buffers_required += + vhost_user_compute_chained_desc_len (vui, txvq, buffer_data_size, + &current, &n_left); + } + } + + /* Something is broken if we need more than 10000 buffers */ + if (PREDICT_FALSE ((buffers_required == 0) || (buffers_required > 10000))) + { + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, n_left); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, rv); + goto done; + } + + vec_validate (cpu->to_next_list, buffers_required); + rv = vlib_buffer_alloc (vm, cpu->to_next_list, buffers_required); + if (PREDICT_FALSE (rv != buffers_required)) + { + vlib_buffer_free (vm, cpu->to_next_list, rv); + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, n_left); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, rv); + goto done; + } + + next = cpu->to_next_list; + vec_validate (cpu->rx_buffers_pdesc, buffers_required); + vlib_get_buffers (vm, next, cpu->rx_buffers_pdesc, buffers_required); + b = cpu->rx_buffers_pdesc; + n_descs_processed = n_left; + + while (n_left) + { + vlib_buffer_t *b_head, *b_current; + u32 bi_current; + u32 desc_data_offset; + u16 desc_idx 
= desc_current; + u32 n_descs; + + desc_table = txvq->packed_desc; + to_next[0] = bi_current = next[0]; + b_head = b_current = b[0]; + b++; + buffers_used++; + ASSERT (buffers_used <= buffers_required); + to_next++; + next++; + n_left_to_next--; + + /* The buffer should already be initialized */ + b_head->total_length_not_including_first_buffer = 0; + b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + desc_data_offset = vui->virtio_net_hdr_sz; + n_descs_to_process = 1; + + if (desc_table[desc_idx].flags & VIRTQ_DESC_F_INDIRECT) + { + n_descs = desc_table[desc_idx].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_idx].addr, + &map_hint); + desc_idx = 0; + if (PREDICT_FALSE (desc_table == 0) || + (enable_csum && + (PREDICT_FALSE + (vhost_user_do_offload + (vui, desc_table, desc_idx, mask, b_head, + &map_hint) != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)))) + { + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); + to_next--; + next--; + n_left_to_next++; + buffers_used--; + b--; + goto out; + } + while (n_descs) + { + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + n_descs--; + } + } + else + { + if (enable_csum) + { + rv = vhost_user_do_offload (vui, desc_table, desc_idx, mask, + b_head, &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + { + vlib_error_count (vm, node->node_index, rv, 1); + to_next--; + next--; + n_left_to_next++; + buffers_used--; + b--; + goto out; + } + } + /* + * For chained descriptor, we process all chains in a single while + * loop. So count how many descriptors in the chain. 
+ */ + n_descs_to_process = 1; + while (desc_table[desc_idx].flags & VIRTQ_DESC_F_NEXT) + { + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + n_descs_to_process++; + } + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + } + + n_rx_bytes += b_head->total_length_not_including_first_buffer; + n_rx_packets++; + + b_head->total_length_not_including_first_buffer -= + b_head->current_length; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head); + + vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index; + vnet_buffer (b_head)->sw_if_index[VLIB_TX] = ~0; + b_head->error = 0; + + if (current_config_index != ~0) + { + b_head->current_config_index = current_config_index; + vnet_buffer (b_head)->feature_arc_index = feature_arc_idx; + } + + out: + ASSERT (n_left >= n_descs_to_process); + n_left -= n_descs_to_process; + + /* advance to next descriptor */ + desc_current = (desc_current + n_descs_to_process) & mask; + + /* + * Although separating memory copies from virtio ring parsing + * is beneficial, we can offer to perform the copies from time + * to time in order to free some space in the ring. 
+ */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) + { + rv = vhost_user_input_copy_packed (vui, cpu->copy, copy_len, + &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + vlib_error_count (vm, node->node_index, rv, 1); + copy_len = 0; + } + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + /* Do the memory copies */ + rv = vhost_user_input_copy_packed (vui, cpu->copy, copy_len, &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + vlib_error_count (vm, node->node_index, rv, 1); + + /* Must do the tracing before giving buffers back to driver */ + if (PREDICT_FALSE (n_trace)) + { + u32 left = n_rx_packets; + + b = cpu->rx_buffers_pdesc; + while (n_trace && left) + { + vhost_trace_t *t0; + + vlib_trace_buffer (vm, node, next_index, b[0], + /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b[0], sizeof (t0[0])); + b++; + vhost_user_rx_trace_packed (t0, vui, qid, txvq, last_used_idx); + last_used_idx = (last_used_idx + 1) & mask; + n_trace--; + left--; + vlib_set_trace_count (vm, node, n_trace); + } + } + + /* + * Give buffers back to driver. 
+ */ + vhost_user_mark_desc_consumed (vui, txvq, desc_head, n_descs_processed); + + /* interrupt (call) handling */ + if ((txvq->callfd_idx != ~0) && + (txvq->avail_event->flags != VRING_EVENT_F_DISABLE)) + { + txvq->n_since_last_int += n_rx_packets; + if (txvq->n_since_last_int > vum->coalesce_frames) + vhost_user_send_call (vm, txvq); + } + + /* increase rx counters */ + vlib_increment_combined_counter + (vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, vm->thread_index, vui->sw_if_index, + n_rx_packets, n_rx_bytes); + + vnet_device_increment_rx_packets (vm->thread_index, n_rx_packets); + + if (PREDICT_FALSE (buffers_used < buffers_required)) + vlib_buffer_free (vm, next, buffers_required - buffers_used); + +done: + return n_rx_packets; +} + VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -765,14 +1429,26 @@ VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm, { vui = pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance); - if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) - n_rx_packets += - vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, - 1); + if (vhost_user_is_packed_ring_supported (vui)) + { + if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) + n_rx_packets += vhost_user_if_input_packed (vm, vum, vui, + dq->queue_id, node, + dq->mode, 1); + else + n_rx_packets += vhost_user_if_input_packed (vm, vum, vui, + dq->queue_id, node, + dq->mode, 0); + } else - n_rx_packets += - vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, - 0); + { + if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) + n_rx_packets += vhost_user_if_input (vm, vum, vui, dq->queue_id, + node, dq->mode, 1); + else + n_rx_packets += vhost_user_if_input (vm, vum, vui, dq->queue_id, + node, dq->mode, 0); + } } } diff --git a/src/vnet/devices/virtio/vhost_user_output.c b/src/vnet/devices/virtio/vhost_user_output.c index b6abe36d972..4f5eb3c1d76 100644 
--- a/src/vnet/devices/virtio/vhost_user_output.c +++ b/src/vnet/devices/virtio/vhost_user_output.c @@ -294,6 +294,424 @@ vhost_user_handle_tx_offload (vhost_user_intf_t * vui, vlib_buffer_t * b, } } +static_always_inline void +vhost_user_mark_desc_available (vlib_main_t * vm, vhost_user_vring_t * rxvq, + u16 * n_descs_processed, u8 chained, + vlib_frame_t * frame, u32 n_left) +{ + u16 desc_idx, flags; + vring_packed_desc_t *desc_table = rxvq->packed_desc; + u16 last_used_idx = rxvq->last_used_idx; + + if (PREDICT_FALSE (*n_descs_processed == 0)) + return; + + if (rxvq->used_wrap_counter) + flags = desc_table[last_used_idx & rxvq->qsz_mask].flags | + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + flags = desc_table[last_used_idx & rxvq->qsz_mask].flags & + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + + vhost_user_advance_last_used_idx (rxvq); + + for (desc_idx = 1; desc_idx < *n_descs_processed; desc_idx++) + { + if (rxvq->used_wrap_counter) + desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags |= + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags &= + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + vhost_user_advance_last_used_idx (rxvq); + } + + desc_table[last_used_idx & rxvq->qsz_mask].flags = flags; + + *n_descs_processed = 0; + + if (chained) + { + vring_packed_desc_t *desc_table = rxvq->packed_desc; + + while (desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags & + VIRTQ_DESC_F_NEXT) + vhost_user_advance_last_used_idx (rxvq); + + /* Advance past the current chained table entries */ + vhost_user_advance_last_used_idx (rxvq); + } + + /* interrupt (call) handling */ + if ((rxvq->callfd_idx != ~0) && + (rxvq->avail_event->flags != VRING_EVENT_F_DISABLE)) + { + vhost_user_main_t *vum = &vhost_user_main; + + rxvq->n_since_last_int += frame->n_vectors - n_left; + if (rxvq->n_since_last_int > vum->coalesce_frames) + vhost_user_send_call (vm, rxvq); + } +} + +static_always_inline void 
+vhost_user_tx_trace_packed (vhost_trace_t * t, vhost_user_intf_t * vui, + u16 qid, vlib_buffer_t * b, + vhost_user_vring_t * rxvq) +{ + vhost_user_main_t *vum = &vhost_user_main; + u32 last_avail_idx = rxvq->last_avail_idx; + u32 desc_current = last_avail_idx & rxvq->qsz_mask; + vring_packed_desc_t *hdr_desc = 0; + u32 hint = 0; + + clib_memset (t, 0, sizeof (*t)); + t->device_index = vui - vum->vhost_user_interfaces; + t->qid = qid; + + hdr_desc = &rxvq->packed_desc[desc_current]; + if (rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT; + /* Header is the first here */ + hdr_desc = map_guest_mem (vui, rxvq->packed_desc[desc_current].addr, + &hint); + } + if (rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED; + } + if (!(rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) && + !(rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC; + } + + t->first_desc_len = hdr_desc ? 
hdr_desc->len : 0; +} + +static_always_inline uword +vhost_user_device_class_packed (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 *buffers = vlib_frame_vector_args (frame); + u32 n_left = frame->n_vectors; + vhost_user_main_t *vum = &vhost_user_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + vhost_user_intf_t *vui = + pool_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance); + u32 qid; + vhost_user_vring_t *rxvq; + u8 error; + u32 thread_index = vm->thread_index; + vhost_cpu_t *cpu = &vum->cpus[thread_index]; + u32 map_hint = 0; + u8 retry = 8; + u16 copy_len; + u16 tx_headers_len; + vring_packed_desc_t *desc_table; + u32 or_flags; + u16 desc_head, desc_index, desc_len; + u16 n_descs_processed; + u8 indirect, chained; + + qid = VHOST_VRING_IDX_RX (*vec_elt_at_index (vui->per_cpu_tx_qid, + thread_index)); + rxvq = &vui->vrings[qid]; + +retry: + error = VHOST_USER_TX_FUNC_ERROR_NONE; + tx_headers_len = 0; + copy_len = 0; + n_descs_processed = 0; + + while (n_left > 0) + { + vlib_buffer_t *b0, *current_b0; + uword buffer_map_addr; + u32 buffer_len; + u16 bytes_left; + u32 total_desc_len = 0; + u16 n_entries = 0; + + indirect = 0; + chained = 0; + if (PREDICT_TRUE (n_left > 1)) + vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD); + + b0 = vlib_get_buffer (vm, buffers[0]); + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + cpu->current_trace = vlib_add_trace (vm, node, b0, + sizeof (*cpu->current_trace)); + vhost_user_tx_trace_packed (cpu->current_trace, vui, qid / 2, b0, + rxvq); + } + + desc_table = rxvq->packed_desc; + desc_head = desc_index = rxvq->last_avail_idx & rxvq->qsz_mask; + if (PREDICT_FALSE (!vhost_user_packed_desc_available (rxvq, desc_head))) + { + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + /* + * Go deeper in case of indirect descriptor. + * To test it, turn off mrg_rxbuf. 
+ */ + if (desc_table[desc_head].flags & VIRTQ_DESC_F_INDIRECT) + { + indirect = 1; + if (PREDICT_FALSE (desc_table[desc_head].len < + sizeof (vring_packed_desc_t))) + { + error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW; + goto done; + } + n_entries = desc_table[desc_head].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_index].addr, + &map_hint); + if (PREDICT_FALSE (desc_table == 0)) + { + error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL; + goto done; + } + desc_index = 0; + } + else if (rxvq->packed_desc[desc_head].flags & VIRTQ_DESC_F_NEXT) + chained = 1; + + desc_len = vui->virtio_net_hdr_sz; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + + /* Get a header from the header array */ + virtio_net_hdr_mrg_rxbuf_t *hdr = &cpu->tx_headers[tx_headers_len]; + tx_headers_len++; + hdr->hdr.flags = 0; + hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; + hdr->num_buffers = 1; + + or_flags = (b0->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM) || + (b0->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM) || + (b0->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM); + + /* Guest supports csum offload and buffer requires checksum offload? */ + if (or_flags && + (vui->features & (1ULL << FEAT_VIRTIO_NET_F_GUEST_CSUM))) + vhost_user_handle_tx_offload (vui, b0, &hdr->hdr); + + /* Prepare a copy order executed later for the header */ + ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[copy_len]; + copy_len++; + cpy->len = vui->virtio_net_hdr_sz; + cpy->dst = buffer_map_addr; + cpy->src = (uword) hdr; + + buffer_map_addr += vui->virtio_net_hdr_sz; + buffer_len -= vui->virtio_net_hdr_sz; + bytes_left = b0->current_length; + current_b0 = b0; + while (1) + { + if (buffer_len == 0) + { + /* Get new output */ + if (chained) + { + /* + * Next one is chained + * Test it with both indirect and mrg_rxbuf off + */ + if (PREDICT_FALSE (!(desc_table[desc_index].flags & + VIRTQ_DESC_F_NEXT))) + { + /* + * Last descriptor in chain. 
+ * Dequeue queued descriptors for this packet + */ + vhost_user_dequeue_chained_descs (rxvq, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + vhost_user_advance_last_avail_idx (rxvq); + desc_index = rxvq->last_avail_idx & rxvq->qsz_mask; + n_descs_processed++; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + total_desc_len += desc_len; + desc_len = 0; + } + else if (indirect) + { + /* + * Indirect table + * Test it with mrg_rxnuf off + */ + if (PREDICT_TRUE (n_entries > 0)) + n_entries--; + else + { + /* Dequeue queued descriptors for this packet */ + vhost_user_dequeue_chained_descs (rxvq, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW; + goto done; + } + total_desc_len += desc_len; + desc_index = (desc_index + 1) & rxvq->qsz_mask; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + desc_len = 0; + } + else if (vui->virtio_net_hdr_sz == 12) + { + /* + * MRG is available + * This is the default setting for the guest VM + */ + virtio_net_hdr_mrg_rxbuf_t *hdr = + &cpu->tx_headers[tx_headers_len - 1]; + + desc_table[desc_index].len = desc_len; + vhost_user_advance_last_avail_idx (rxvq); + desc_head = desc_index = + rxvq->last_avail_idx & rxvq->qsz_mask; + hdr->num_buffers++; + n_descs_processed++; + desc_len = 0; + + if (PREDICT_FALSE (!vhost_user_packed_desc_available + (rxvq, desc_index))) + { + /* Dequeue queued descriptors for this packet */ + vhost_user_dequeue_descs (rxvq, hdr, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + } + else + { + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG; + goto done; + } + } + + ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[copy_len]; + copy_len++; + cpy->len = bytes_left; + cpy->len = (cpy->len > 
buffer_len) ? buffer_len : cpy->len; + cpy->dst = buffer_map_addr; + cpy->src = (uword) vlib_buffer_get_current (current_b0) + + current_b0->current_length - bytes_left; + + bytes_left -= cpy->len; + buffer_len -= cpy->len; + buffer_map_addr += cpy->len; + desc_len += cpy->len; + + CLIB_PREFETCH (&rxvq->packed_desc, CLIB_CACHE_LINE_BYTES, LOAD); + + /* Check if vlib buffer has more data. If not, get more or break */ + if (PREDICT_TRUE (!bytes_left)) + { + if (PREDICT_FALSE + (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + current_b0 = vlib_get_buffer (vm, current_b0->next_buffer); + bytes_left = current_b0->current_length; + } + else + { + /* End of packet */ + break; + } + } + } + + /* Move from available to used ring */ + total_desc_len += desc_len; + rxvq->packed_desc[desc_head].len = total_desc_len; + + vhost_user_advance_last_avail_table_idx (vui, rxvq, chained); + n_descs_processed++; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + cpu->current_trace->hdr = cpu->tx_headers[tx_headers_len - 1]; + + n_left--; + + /* + * Do the copy periodically to prevent + * cpu->copy array overflow and corrupt memory + */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_TX_COPY_THRESHOLD) || chained) + { + if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len, + &map_hint))) + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); + copy_len = 0; + + /* give buffers back to driver */ + vhost_user_mark_desc_available (vm, rxvq, &n_descs_processed, + chained, frame, n_left); + } + + buffers++; + } + +done: + if (PREDICT_TRUE (copy_len)) + { + if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len, + &map_hint))) + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); + + vhost_user_mark_desc_available (vm, rxvq, &n_descs_processed, chained, + frame, n_left); + } + + /* + * When n_left is set, error is always set to something too. 
+ * In case error is due to lack of remaining buffers, we go back up and + * retry. + * The idea is that it is better to waste some time on packets + * that have been processed already than dropping them and get + * more fresh packets with a good likelihood that they will be dropped too. + * This technique also gives more time to VM driver to pick-up packets. + * In case the traffic flows from physical to virtual interfaces, this + * technique will end-up leveraging the physical NIC buffer in order to + * absorb the VM's CPU jitter. + */ + if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry) + { + retry--; + goto retry; + } + + vhost_user_vring_unlock (vui, qid); + + if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE)) + { + vlib_error_count (vm, node->node_index, error, n_left); + vlib_increment_simple_counter + (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_DROP, thread_index, vui->sw_if_index, n_left); + } + + vlib_buffer_free (vm, vlib_frame_vector_args (frame), frame->n_vectors); + return frame->n_vectors; +} + VNET_DEVICE_CLASS_TX_FN (vhost_user_device_class) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -339,6 +757,9 @@ VNET_DEVICE_CLASS_TX_FN (vhost_user_device_class) (vlib_main_t * vm, if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); + if (vhost_user_is_packed_ring_supported (vui)) + return (vhost_user_device_class_packed (vm, node, frame)); + retry: error = VHOST_USER_TX_FUNC_ERROR_NONE; tx_headers_len = 0; diff --git a/src/vpp/api/custom_dump.c b/src/vpp/api/custom_dump.c index 657a5802f47..ab80d7e7617 100644 --- a/src/vpp/api/custom_dump.c +++ b/src/vpp/api/custom_dump.c @@ -1734,9 +1734,11 @@ static void *vl_api_create_vhost_user_if_t_print if (mp->disable_indirect_desc) s = format (s, "disable_indirect_desc "); if (mp->tag[0]) - s = format (s, "tag %s", mp->tag); + s = format (s, "tag %s ", mp->tag); if (mp->enable_gso) - s = format (s, 
"gso"); + s = format (s, "gso "); + if (mp->enable_packed) + s = format (s, "packed"); FINISH; } @@ -1755,7 +1757,9 @@ static void *vl_api_modify_vhost_user_if_t_print if (mp->renumber) s = format (s, "renumber %d ", (mp->custom_dev_instance)); if (mp->enable_gso) - s = format (s, "gso"); + s = format (s, "gso "); + if (mp->enable_packed) + s = format (s, "packed"); FINISH; } diff --git a/test/vpp_vhost_interface.py b/test/vpp_vhost_interface.py index 569fe36d1d6..fd2928eac1d 100644 --- a/test/vpp_vhost_interface.py +++ b/test/vpp_vhost_interface.py @@ -6,8 +6,8 @@ class VppVhostInterface(VppInterface): def __init__(self, test, sock_filename, is_server=0, renumber=0, disable_mrg_rxbuf=0, disable_indirect_desc=0, gso=0, - custom_dev_instance=0, use_custom_mac=0, mac_address='', - tag=''): + packed_ring=0, custom_dev_instance=0, use_custom_mac=0, + mac_address='', tag=''): """ Create VPP Vhost interface """ super(VppVhostInterface, self).__init__(test) @@ -17,6 +17,7 @@ class VppVhostInterface(VppInterface): self.disable_mrg_rxbuf = disable_mrg_rxbuf self.disable_indirect_desc = disable_indirect_desc self.gso = gso + self.packed_ring = packed_ring self.custom_dev_instance = custom_dev_instance self.use_custom_mac = use_custom_mac self.mac_address = mac_address @@ -29,6 +30,7 @@ class VppVhostInterface(VppInterface): self.disable_mrg_rxbuf, self.disable_indirect_desc, self.gso, + self.packed_ring, self.custom_dev_instance, self.use_custom_mac, self.mac_address, |