aboutsummaryrefslogtreecommitdiffstats
path: root/src/vnet/devices/virtio/vhost_user.c
diff options
context:
space:
mode:
authorSteven Luong <sluong@cisco.com>2020-03-23 09:34:59 -0700
committerSteven Luong <sluong@cisco.com>2020-04-27 09:25:32 -0700
commitbc0d9ff6727d77668e216aba1c6d6cb753fa2ac3 (patch)
treedc2af469cf255d3d819dd52f1cc6703d708f9728 /src/vnet/devices/virtio/vhost_user.c
parentba6deb96e923f71aa9387c06000412c3fb1362fa (diff)
virtio: support virtio 1.1 packed ring in vhost
virtio 1.1 defines a number of new features. Packed ring is among the most notable and important one. It combines used, available, and descripptor rings into one. This patch provides experimental support for packed ring. To avoid regression, when packed ring is configured for the interface, it is branched to a separate RX and TX driver. Non packed ring should continue to perform as it was before. Packed ring is tested using qemu4.2 and ubuntu focal fossa (kernel 5.4.0-12) on the guess VM which supports packed ring. To configure VPP with packed ring, just add the optional keyword "packed" when creating the vhost interface. To bring up the guest VM with packed ring, add "packed=on" in the qemu launch command. To facilitate troubleshooting, also added "verbose" option in show vhost desc CLI to include displaying the indirect descriptors. Known qemu reconnect issue - If VPP is restarted, guest VMs also need to be restarted. The problem is kernel virtio-net-pci keeps track of the previous available and used indices. For virtio 1.0, these indices are in shared memory and qemu can easily copy them to pass to the backend for reconnect. For virio 1.1, these indices are no longer in shared memory. Qemu needs a new mechanism to retrieve them and it is not currently implemented. So when the protocol reconnects, qemu does not have the correct available and used indices to pass to the backend. As a result, after the reconnect, virtio-net-pci is reading the TX ring from the wrong position in the ring, not the same position which the backend is writing. Similar problem exists also in the RX. Type: feature Signed-off-by: Steven Luong <sluong@cisco.com> Change-Id: I5afc50b0bafab5a1de7a6dd10f399db3fafd144c
Diffstat (limited to 'src/vnet/devices/virtio/vhost_user.c')
-rw-r--r--src/vnet/devices/virtio/vhost_user.c311
1 files changed, 264 insertions, 47 deletions
diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c
index 7094a00fb33..d24e516a93c 100644
--- a/src/vnet/devices/virtio/vhost_user.c
+++ b/src/vnet/devices/virtio/vhost_user.c
@@ -466,6 +466,8 @@ vhost_user_socket_read (clib_file_t * uf)
if (vui->enable_gso)
msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+ if (vui->enable_packed)
+ msg.u64 |= (1ULL << FEAT_VIRTIO_F_RING_PACKED);
msg.size = sizeof (msg.u64);
vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply "
@@ -655,7 +657,11 @@ vhost_user_socket_read (clib_file_t * uf)
vui->vrings[msg.state.index].used->idx;
/* tell driver that we don't want interrupts */
- vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
+ if (vhost_user_is_packed_ring_supported (vui))
+ vui->vrings[msg.state.index].used_event->flags =
+ VRING_EVENT_F_DISABLE;
+ else
+ vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
vlib_worker_thread_barrier_release (vm);
vhost_user_update_iface_state (vui);
break;
@@ -762,10 +768,47 @@ vhost_user_socket_read (clib_file_t * uf)
break;
case VHOST_USER_SET_VRING_BASE:
- vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ vu_log_debug (vui,
+ "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x",
vui->hw_if_index, msg.state.index, msg.state.num);
vlib_worker_thread_barrier_sync (vm);
vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+ if (vhost_user_is_packed_ring_supported (vui))
+ {
+ /*
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | last avail idx | | last used idx | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * ^ ^
+ * | |
+ * avail wrap counter used wrap counter
+ */
+ /* last avail idx at bit 0-14. */
+ vui->vrings[msg.state.index].last_avail_idx =
+ msg.state.num & 0x7fff;
+ /* avail wrap counter at bit 15 */
+ vui->vrings[msg.state.index].avail_wrap_counter =
+ ! !(msg.state.num & (1 << 15));
+
+ /*
+ * Although last_used_idx is passed in the upper 16 bits in qemu
+ * implementation, in practice, last_avail_idx and last_used_idx are
+ * usually the same. As a result, DPDK does not bother to pass us
+ * last_used_idx. The spec is not clear on thex coding. I figured it
+ * out by reading the qemu code. So let's just read last_avail_idx
+ * and set last_used_idx equals to last_avail_idx.
+ */
+ vui->vrings[msg.state.index].last_used_idx =
+ vui->vrings[msg.state.index].last_avail_idx;
+ vui->vrings[msg.state.index].used_wrap_counter =
+ vui->vrings[msg.state.index].avail_wrap_counter;
+
+ if (vui->vrings[msg.state.index].avail_wrap_counter == 1)
+ vui->vrings[msg.state.index].avail_wrap_counter =
+ VIRTQ_DESC_F_AVAIL;
+ }
vlib_worker_thread_barrier_release (vm);
break;
@@ -784,6 +827,15 @@ vhost_user_socket_read (clib_file_t * uf)
* closing the vring also initializes the vring last_avail_idx
*/
msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
+ if (vhost_user_is_packed_ring_supported (vui))
+ {
+ msg.state.num =
+ (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) |
+ (! !vui->vrings[msg.state.index].avail_wrap_counter << 15);
+ msg.state.num |=
+ ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) |
+ (! !vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16;
+ }
msg.flags |= 4;
msg.size = sizeof (msg.state);
@@ -793,7 +845,8 @@ vhost_user_socket_read (clib_file_t * uf)
*/
vhost_user_vring_close (vui, msg.state.index);
vlib_worker_thread_barrier_release (vm);
- vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ vu_log_debug (vui,
+ "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x",
vui->hw_if_index, msg.state.index, msg.state.num);
n =
send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
@@ -1440,7 +1493,8 @@ vhost_user_vui_init (vnet_main_t * vnm,
vhost_user_intf_t * vui,
int server_sock_fd,
const char *sock_filename,
- u64 feature_mask, u32 * sw_if_index, u8 enable_gso)
+ u64 feature_mask, u32 * sw_if_index, u8 enable_gso,
+ u8 enable_packed)
{
vnet_sw_interface_t *sw;
int q;
@@ -1472,6 +1526,7 @@ vhost_user_vui_init (vnet_main_t * vnm,
vui->log_base_addr = 0;
vui->if_index = vui - vum->vhost_user_interfaces;
vui->enable_gso = enable_gso;
+ vui->enable_packed = enable_packed;
/*
* enable_gso takes precedence over configurable feature mask if there
* is a clash.
@@ -1519,7 +1574,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
u32 * sw_if_index,
u64 feature_mask,
u8 renumber, u32 custom_dev_instance, u8 * hwaddr,
- u8 enable_gso)
+ u8 enable_gso, u8 enable_packed)
{
vhost_user_intf_t *vui = NULL;
u32 sw_if_idx = ~0;
@@ -1560,7 +1615,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
vlib_worker_thread_barrier_release (vm);
vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename,
- feature_mask, &sw_if_idx, enable_gso);
+ feature_mask, &sw_if_idx, enable_gso, enable_packed);
vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000);
vhost_user_rx_thread_placement (vui, 1);
@@ -1582,7 +1637,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
u8 is_server,
u32 sw_if_index,
u64 feature_mask, u8 renumber, u32 custom_dev_instance,
- u8 enable_gso)
+ u8 enable_gso, u8 enable_packed)
{
vhost_user_main_t *vum = &vhost_user_main;
vhost_user_intf_t *vui = NULL;
@@ -1619,7 +1674,8 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
vhost_user_term_if (vui);
vhost_user_vui_init (vnm, vui, server_sock_fd,
- sock_filename, feature_mask, &sw_if_idx, enable_gso);
+ sock_filename, feature_mask, &sw_if_idx, enable_gso,
+ enable_packed);
if (renumber)
vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
@@ -1645,7 +1701,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
u8 hwaddr[6];
u8 *hw = NULL;
clib_error_t *error = NULL;
- u8 enable_gso = 0;
+ u8 enable_gso = 0, enable_packed = 0;
/* Get a line of input. */
if (!unformat_user (input, unformat_line_input, line_input))
@@ -1653,6 +1709,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
/* GSO feature is disable by default */
feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+ /* packed-ring feature is disable by default */
+ feature_mask &= ~(1ULL << FEAT_VIRTIO_F_RING_PACKED);
while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
{
if (unformat (line_input, "socket %s", &sock_filename))
@@ -1661,6 +1719,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
is_server = 1;
else if (unformat (line_input, "gso"))
enable_gso = 1;
+ else if (unformat (line_input, "packed"))
+ enable_packed = 1;
else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
;
else
@@ -1685,7 +1745,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename,
is_server, &sw_if_index, feature_mask,
renumber, custom_dev_instance, hw,
- enable_gso)))
+ enable_gso, enable_packed)))
{
error = clib_error_return (0, "vhost_user_create_if returned %d", rv);
goto done;
@@ -1799,6 +1859,186 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
return rv;
}
+static u8 *
+format_vhost_user_desc (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ vring_desc_t *desc_table = va_arg (*args, vring_desc_t *);
+ int idx = va_arg (*args, int);
+ u32 *mem_hint = va_arg (*args, u32 *);
+
+ s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+ desc_table[idx].flags, desc_table[idx].next,
+ pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+ mem_hint)));
+ return s;
+}
+
+static u8 *
+format_vhost_user_vring (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ int q = va_arg (*args, int);
+
+ s = format (s, fmt, vui->vrings[q].avail->flags, vui->vrings[q].avail->idx,
+ vui->vrings[q].used->flags, vui->vrings[q].used->idx);
+ return s;
+}
+
+static void
+vhost_user_show_fds (vlib_main_t * vm, vhost_user_intf_t * vui, int q)
+{
+ int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
+ int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
+
+ vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", kickfd, callfd,
+ vui->vrings[q].errfd);
+}
+
+static void
+vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+ int show_descr, int show_verbose)
+{
+ int j;
+ u32 mem_hint = 0;
+ u32 idx;
+ u32 n_entries;
+ vring_desc_t *desc_table;
+
+ if (vui->vrings[q].avail && vui->vrings[q].used)
+ vlib_cli_output (vm, "%U", format_vhost_user_vring,
+ " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vui, q);
+
+ vhost_user_show_fds (vm, vui, q);
+
+ if (show_descr)
+ {
+ vlib_cli_output (vm, "\n descriptor table:\n");
+ vlib_cli_output (vm,
+ " slot addr len flags next "
+ "user_addr\n");
+ vlib_cli_output (vm,
+ " ===== ================== ===== ====== ===== "
+ "==================\n");
+ for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+ {
+ desc_table = vui->vrings[q].desc;
+ vlib_cli_output (vm, "%U", format_vhost_user_desc,
+ " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui,
+ desc_table, j, &mem_hint);
+ if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ n_entries = desc_table[j].len / sizeof (vring_desc_t);
+ desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+ if (desc_table)
+ {
+ for (idx = 0; idx < clib_min (20, n_entries); idx++)
+ {
+ vlib_cli_output
+ (vm, "%U", format_vhost_user_desc,
+ "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, idx, &mem_hint);
+ }
+ if (n_entries >= 20)
+ vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+ n_entries);
+ }
+ }
+ }
+ }
+}
+
+static u8 *
+format_vhost_user_packed_desc (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *);
+ int idx = va_arg (*args, int);
+ u32 *mem_hint = va_arg (*args, u32 *);
+
+ s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+ desc_table[idx].flags, desc_table[idx].id,
+ pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+ mem_hint)));
+ return s;
+}
+
+static u8 *
+format_vhost_user_vring_packed (u8 * s, va_list * args)
+{
+ char *fmt = va_arg (*args, char *);
+ vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+ int q = va_arg (*args, int);
+
+ s = format (s, fmt, vui->vrings[q].avail_event->flags,
+ vui->vrings[q].avail_event->off_wrap,
+ vui->vrings[q].used_event->flags,
+ vui->vrings[q].used_event->off_wrap,
+ vui->vrings[q].avail_wrap_counter,
+ vui->vrings[q].used_wrap_counter);
+ return s;
+}
+
+static void
+vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+ int show_descr, int show_verbose)
+{
+ int j;
+ u32 mem_hint = 0;
+ u32 idx;
+ u32 n_entries;
+ vring_packed_desc_t *desc_table;
+
+ if (vui->vrings[q].avail_event && vui->vrings[q].used_event)
+ vlib_cli_output (vm, "%U", format_vhost_user_vring_packed,
+ " avail_event.flags %x avail_event.off_wrap %u "
+ "used_event.flags %x used_event.off_wrap %u\n"
+ " avail wrap counter %u, used wrap counter %u\n",
+ vui, q);
+
+ vhost_user_show_fds (vm, vui, q);
+
+ if (show_descr)
+ {
+ vlib_cli_output (vm, "\n descriptor table:\n");
+ vlib_cli_output (vm,
+ " slot addr len flags id "
+ "user_addr\n");
+ vlib_cli_output (vm,
+ " ===== ================== ===== ====== ===== "
+ "==================\n");
+ for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+ {
+ desc_table = vui->vrings[q].packed_desc;
+ vlib_cli_output (vm, "%U", format_vhost_user_packed_desc,
+ " %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, j, &mem_hint);
+ if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+ {
+ n_entries = desc_table[j].len >> 4;
+ desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+ if (desc_table)
+ {
+ for (idx = 0; idx < clib_min (20, n_entries); idx++)
+ {
+ vlib_cli_output
+ (vm, "%U", format_vhost_user_packed_desc,
+ "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+ desc_table, idx, &mem_hint);
+ }
+ if (n_entries >= 20)
+ vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+ n_entries);
+ }
+ }
+ }
+ }
+}
+
clib_error_t *
show_vhost_user_command_fn (vlib_main_t * vm,
unformat_input_t * input,
@@ -1814,6 +2054,7 @@ show_vhost_user_command_fn (vlib_main_t * vm,
u32 ci;
int i, j, q;
int show_descr = 0;
+ int show_verbose = 0;
struct feat_struct
{
u8 bit;
@@ -1855,6 +2096,8 @@ show_vhost_user_command_fn (vlib_main_t * vm,
}
else if (unformat (input, "descriptors") || unformat (input, "desc"))
show_descr = 1;
+ else if (unformat (input, "verbose"))
+ show_verbose = 1;
else
{
error = clib_error_return (0, "unknown input `%U'",
@@ -1884,6 +2127,8 @@ show_vhost_user_command_fn (vlib_main_t * vm,
hw_if_indices[i]);
if (vui->enable_gso)
vlib_cli_output (vm, " GSO enable");
+ if (vui->enable_packed)
+ vlib_cli_output (vm, " Packed ring enable");
vlib_cli_output (vm, "virtio_net_hdr_sz %d\n"
" features mask (0x%llx): \n"
@@ -1985,41 +2230,11 @@ show_vhost_user_command_fn (vlib_main_t * vm,
vui->vrings[q].last_avail_idx,
vui->vrings[q].last_used_idx);
- if (vui->vrings[q].avail && vui->vrings[q].used)
- vlib_cli_output (vm,
- " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
- vui->vrings[q].avail->flags,
- vui->vrings[q].avail->idx,
- vui->vrings[q].used->flags,
- vui->vrings[q].used->idx);
-
- int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
- int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
- vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n",
- kickfd, callfd, vui->vrings[q].errfd);
-
- if (show_descr)
- {
- vlib_cli_output (vm, "\n descriptor table:\n");
- vlib_cli_output (vm,
- " id addr len flags next user_addr\n");
- vlib_cli_output (vm,
- " ===== ================== ===== ====== ===== ==================\n");
- for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
- {
- u32 mem_hint = 0;
- vlib_cli_output (vm,
- " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
- j, vui->vrings[q].desc[j].addr,
- vui->vrings[q].desc[j].len,
- vui->vrings[q].desc[j].flags,
- vui->vrings[q].desc[j].next,
- pointer_to_uword (map_guest_mem
- (vui,
- vui->vrings[q].desc[j].
- addr, &mem_hint)));
- }
- }
+ if (vhost_user_is_packed_ring_supported (vui))
+ vhost_user_show_desc_packed (vm, vui, q, show_descr,
+ show_verbose);
+ else
+ vhost_user_show_desc (vm, vui, q, show_descr, show_verbose);
}
vlib_cli_output (vm, "\n");
}
@@ -2090,7 +2305,8 @@ done:
VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
.path = "create vhost-user",
.short_help = "create vhost-user socket <socket-filename> [server] "
- "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso]",
+ "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso] "
+ "[packed]",
.function = vhost_user_connect_command_fn,
.is_mp_safe = 1,
};
@@ -2251,7 +2467,8 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = {
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
.path = "show vhost-user",
- .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]",
+ .short_help = "show vhost-user [<interface> [<interface> [..]]] "
+ "[[descriptors] [verbose]]",
.function = show_vhost_user_command_fn,
};
/* *INDENT-ON* */