diff options
Diffstat (limited to 'src/vnet')
 src/vnet/devices/virtio/FEATURE.yaml         |   2 +-
 src/vnet/devices/virtio/vhost_user.api       |   6 +
 src/vnet/devices/virtio/vhost_user.c         | 311 ++++++--
 src/vnet/devices/virtio/vhost_user.h         |  63 +-
 src/vnet/devices/virtio/vhost_user_api.c     |  20 +-
 src/vnet/devices/virtio/vhost_user_inline.h  |  79 ++
 src/vnet/devices/virtio/vhost_user_input.c   | 858 ++++++++++++++++++---
 src/vnet/devices/virtio/vhost_user_output.c  | 421 +++++++++-
 8 files changed, 1604 insertions(+), 156 deletions(-)
diff --git a/src/vnet/devices/virtio/FEATURE.yaml b/src/vnet/devices/virtio/FEATURE.yaml index de449c866e5..b446a559403 100644 --- a/src/vnet/devices/virtio/FEATURE.yaml +++ b/src/vnet/devices/virtio/FEATURE.yaml @@ -7,7 +7,7 @@ features: - device mode to emulate vhost-user interface presented to VPP from the guest VM. - support multi-queue, GSO, checksum offload, indirect descriptor, - and jumbo frame. + jumbo frame, and packed ring. description: "Virtio v1.0 implementation" missing: - API dump filtering by sw_if_index diff --git a/src/vnet/devices/virtio/vhost_user.api b/src/vnet/devices/virtio/vhost_user.api index 9b057552c00..127b0a27fc7 100644 --- a/src/vnet/devices/virtio/vhost_user.api +++ b/src/vnet/devices/virtio/vhost_user.api @@ -13,7 +13,7 @@ * limitations under the License. */ -option version = "4.0.0"; +option version = "4.0.1"; import "vnet/interface_types.api"; import "vnet/ethernet/ethernet_types.api"; @@ -27,6 +27,7 @@ import "vnet/devices/virtio/virtio_types.api"; @param disable_mrg_rxbuf - disable the use of merge receive buffers @param disable_indirect_desc - disable the use of indirect descriptors which driver can use @param enable_gso - enable gso support (default 0) + @param enable_packed - enable packed ring support (default 0) @param mac_address - hardware address to use if 'use_custom_mac' is set */ define create_vhost_user_if @@ -39,6 +40,7 @@ define create_vhost_user_if bool disable_mrg_rxbuf; bool disable_indirect_desc; bool enable_gso; + bool enable_packed; u32 custom_dev_instance; bool use_custom_mac; vl_api_mac_address_t mac_address; @@ -62,6 +64,7 @@ define create_vhost_user_if_reply @param is_server - our side is socket server @param sock_filename - unix socket filename, used to speak with frontend @param enable_gso - enable gso support (default 0) + @param enable_packed - enable packed ring support (default 0) */ autoreply define modify_vhost_user_if { @@ -72,6 +75,7 @@ autoreply define modify_vhost_user_if string 
sock_filename[256]; bool renumber; bool enable_gso; + bool enable_packed; u32 custom_dev_instance; }; diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c index 7094a00fb33..d24e516a93c 100644 --- a/src/vnet/devices/virtio/vhost_user.c +++ b/src/vnet/devices/virtio/vhost_user.c @@ -466,6 +466,8 @@ vhost_user_socket_read (clib_file_t * uf) if (vui->enable_gso) msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + if (vui->enable_packed) + msg.u64 |= (1ULL << FEAT_VIRTIO_F_RING_PACKED); msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply " @@ -655,7 +657,11 @@ vhost_user_socket_read (clib_file_t * uf) vui->vrings[msg.state.index].used->idx; /* tell driver that we don't want interrupts */ - vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + if (vhost_user_is_packed_ring_supported (vui)) + vui->vrings[msg.state.index].used_event->flags = + VRING_EVENT_F_DISABLE; + else + vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; vlib_worker_thread_barrier_release (vm); vhost_user_update_iface_state (vui); break; @@ -762,10 +768,47 @@ vhost_user_socket_read (clib_file_t * uf) break; case VHOST_USER_SET_VRING_BASE: - vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); vlib_worker_thread_barrier_sync (vm); vui->vrings[msg.state.index].last_avail_idx = msg.state.num; + if (vhost_user_is_packed_ring_supported (vui)) + { + /* + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | last avail idx | | last used idx | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * ^ ^ + * | | + * avail wrap counter used wrap counter + */ + /* last avail idx at bit 0-14. 
*/ + vui->vrings[msg.state.index].last_avail_idx = + msg.state.num & 0x7fff; + /* avail wrap counter at bit 15 */ + vui->vrings[msg.state.index].avail_wrap_counter = + ! !(msg.state.num & (1 << 15)); + + /* + * Although last_used_idx is passed in the upper 16 bits in qemu + * implementation, in practice, last_avail_idx and last_used_idx are + * usually the same. As a result, DPDK does not bother to pass us + * last_used_idx. The spec is not clear on the coding. I figured it + * out by reading the qemu code. So let's just read last_avail_idx + * and set last_used_idx equal to last_avail_idx. + */ + vui->vrings[msg.state.index].last_used_idx = + vui->vrings[msg.state.index].last_avail_idx; + vui->vrings[msg.state.index].used_wrap_counter = + vui->vrings[msg.state.index].avail_wrap_counter; + + if (vui->vrings[msg.state.index].avail_wrap_counter == 1) + vui->vrings[msg.state.index].avail_wrap_counter = + VIRTQ_DESC_F_AVAIL; + } vlib_worker_thread_barrier_release (vm); break; @@ -784,6 +827,15 @@ vhost_user_socket_read (clib_file_t * uf) * closing the vring also initializes the vring last_avail_idx */ msg.state.num = vui->vrings[msg.state.index].last_avail_idx; + if (vhost_user_is_packed_ring_supported (vui)) + { + msg.state.num = + (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) | + (! !vui->vrings[msg.state.index].avail_wrap_counter << 15); + msg.state.num |= + ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) | + (! 
!vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16; + } msg.flags |= 4; msg.size = sizeof (msg.state); @@ -793,7 +845,8 @@ vhost_user_socket_read (clib_file_t * uf) */ vhost_user_vring_close (vui, msg.state.index); vlib_worker_thread_barrier_release (vm); - vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); n = send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); @@ -1440,7 +1493,8 @@ vhost_user_vui_init (vnet_main_t * vnm, vhost_user_intf_t * vui, int server_sock_fd, const char *sock_filename, - u64 feature_mask, u32 * sw_if_index, u8 enable_gso) + u64 feature_mask, u32 * sw_if_index, u8 enable_gso, + u8 enable_packed) { vnet_sw_interface_t *sw; int q; @@ -1472,6 +1526,7 @@ vhost_user_vui_init (vnet_main_t * vnm, vui->log_base_addr = 0; vui->if_index = vui - vum->vhost_user_interfaces; vui->enable_gso = enable_gso; + vui->enable_packed = enable_packed; /* * enable_gso takes precedence over configurable feature mask if there * is a clash. 
@@ -1519,7 +1574,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, u32 * sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 enable_gso) + u8 enable_gso, u8 enable_packed) { vhost_user_intf_t *vui = NULL; u32 sw_if_idx = ~0; @@ -1560,7 +1615,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, vlib_worker_thread_barrier_release (vm); vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename, - feature_mask, &sw_if_idx, enable_gso); + feature_mask, &sw_if_idx, enable_gso, enable_packed); vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000); vhost_user_rx_thread_placement (vui, 1); @@ -1582,7 +1637,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, u8 is_server, u32 sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, - u8 enable_gso) + u8 enable_gso, u8 enable_packed) { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui = NULL; @@ -1619,7 +1674,8 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_term_if (vui); vhost_user_vui_init (vnm, vui, server_sock_fd, - sock_filename, feature_mask, &sw_if_idx, enable_gso); + sock_filename, feature_mask, &sw_if_idx, enable_gso, + enable_packed); if (renumber) vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); @@ -1645,7 +1701,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, u8 hwaddr[6]; u8 *hw = NULL; clib_error_t *error = NULL; - u8 enable_gso = 0; + u8 enable_gso = 0, enable_packed = 0; /* Get a line of input. 
*/ if (!unformat_user (input, unformat_line_input, line_input)) @@ -1653,6 +1709,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm, /* GSO feature is disable by default */ feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + /* packed-ring feature is disable by default */ + feature_mask &= ~(1ULL << FEAT_VIRTIO_F_RING_PACKED); while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "socket %s", &sock_filename)) @@ -1661,6 +1719,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm, is_server = 1; else if (unformat (line_input, "gso")) enable_gso = 1; + else if (unformat (line_input, "packed")) + enable_packed = 1; else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask)) ; else @@ -1685,7 +1745,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename, is_server, &sw_if_index, feature_mask, renumber, custom_dev_instance, hw, - enable_gso))) + enable_gso, enable_packed))) { error = clib_error_return (0, "vhost_user_create_if returned %d", rv); goto done; @@ -1799,6 +1859,186 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, return rv; } +static u8 * +format_vhost_user_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + vring_desc_t *desc_table = va_arg (*args, vring_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].next, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static u8 * +format_vhost_user_vring (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + int q = va_arg (*args, int); + + s = format (s, fmt, vui->vrings[q].avail->flags, vui->vrings[q].avail->idx, + 
vui->vrings[q].used->flags, vui->vrings[q].used->idx); + return s; +} + +static void +vhost_user_show_fds (vlib_main_t * vm, vhost_user_intf_t * vui, int q) +{ + int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx); + int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx); + + vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", kickfd, callfd, + vui->vrings[q].errfd); +} + +static void +vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_desc_t *desc_table; + + if (vui->vrings[q].avail && vui->vrings[q].used) + vlib_cli_output (vm, "%U", format_vhost_user_vring, + " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", + vui, q); + + vhost_user_show_fds (vm, vui, q); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags next " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) + { + desc_table = vui->vrings[q].desc; + vlib_cli_output (vm, "%U", format_vhost_user_desc, + " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len / sizeof (vring_desc_t); + desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + +static u8 * +format_vhost_user_packed_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + 
vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].id, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static u8 * +format_vhost_user_vring_packed (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + int q = va_arg (*args, int); + + s = format (s, fmt, vui->vrings[q].avail_event->flags, + vui->vrings[q].avail_event->off_wrap, + vui->vrings[q].used_event->flags, + vui->vrings[q].used_event->off_wrap, + vui->vrings[q].avail_wrap_counter, + vui->vrings[q].used_wrap_counter); + return s; +} + +static void +vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_packed_desc_t *desc_table; + + if (vui->vrings[q].avail_event && vui->vrings[q].used_event) + vlib_cli_output (vm, "%U", format_vhost_user_vring_packed, + " avail_event.flags %x avail_event.off_wrap %u " + "used_event.flags %x used_event.off_wrap %u\n" + " avail wrap counter %u, used wrap counter %u\n", + vui, q); + + vhost_user_show_fds (vm, vui, q); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags id " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) + { + desc_table = vui->vrings[q].packed_desc; + vlib_cli_output (vm, "%U", format_vhost_user_packed_desc, + " %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len >> 4; + desc_table = 
map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_packed_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + clib_error_t * show_vhost_user_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -1814,6 +2054,7 @@ show_vhost_user_command_fn (vlib_main_t * vm, u32 ci; int i, j, q; int show_descr = 0; + int show_verbose = 0; struct feat_struct { u8 bit; @@ -1855,6 +2096,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, } else if (unformat (input, "descriptors") || unformat (input, "desc")) show_descr = 1; + else if (unformat (input, "verbose")) + show_verbose = 1; else { error = clib_error_return (0, "unknown input `%U'", @@ -1884,6 +2127,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, hw_if_indices[i]); if (vui->enable_gso) vlib_cli_output (vm, " GSO enable"); + if (vui->enable_packed) + vlib_cli_output (vm, " Packed ring enable"); vlib_cli_output (vm, "virtio_net_hdr_sz %d\n" " features mask (0x%llx): \n" @@ -1985,41 +2230,11 @@ show_vhost_user_command_fn (vlib_main_t * vm, vui->vrings[q].last_avail_idx, vui->vrings[q].last_used_idx); - if (vui->vrings[q].avail && vui->vrings[q].used) - vlib_cli_output (vm, - " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", - vui->vrings[q].avail->flags, - vui->vrings[q].avail->idx, - vui->vrings[q].used->flags, - vui->vrings[q].used->idx); - - int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx); - int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx); - vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", - kickfd, callfd, vui->vrings[q].errfd); - - if (show_descr) - { - vlib_cli_output (vm, "\n descriptor table:\n"); - vlib_cli_output (vm, - " id addr len flags next user_addr\n"); - vlib_cli_output (vm, - " ===== 
================== ===== ====== ===== ==================\n"); - for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) - { - u32 mem_hint = 0; - vlib_cli_output (vm, - " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", - j, vui->vrings[q].desc[j].addr, - vui->vrings[q].desc[j].len, - vui->vrings[q].desc[j].flags, - vui->vrings[q].desc[j].next, - pointer_to_uword (map_guest_mem - (vui, - vui->vrings[q].desc[j]. - addr, &mem_hint))); - } - } + if (vhost_user_is_packed_ring_supported (vui)) + vhost_user_show_desc_packed (vm, vui, q, show_descr, + show_verbose); + else + vhost_user_show_desc (vm, vui, q, show_descr, show_verbose); } vlib_cli_output (vm, "\n"); } @@ -2090,7 +2305,8 @@ done: VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .path = "create vhost-user", .short_help = "create vhost-user socket <socket-filename> [server] " - "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso]", + "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso] " + "[packed]", .function = vhost_user_connect_command_fn, .is_mp_safe = 1, }; @@ -2251,7 +2467,8 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { /* *INDENT-OFF* */ VLIB_CLI_COMMAND (show_vhost_user_command, static) = { .path = "show vhost-user", - .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]", + .short_help = "show vhost-user [<interface> [<interface> [..]]] " + "[[descriptors] [verbose]]", .function = show_vhost_user_command_fn, }; /* *INDENT-ON* */ diff --git a/src/vnet/devices/virtio/vhost_user.h b/src/vnet/devices/virtio/vhost_user.h index f14f26a71e4..b86f42e70e8 100644 --- a/src/vnet/devices/virtio/vhost_user.h +++ b/src/vnet/devices/virtio/vhost_user.h @@ -25,8 +25,15 @@ #define VHOST_USER_VRING_NOFD_MASK 0x100 #define VIRTQ_DESC_F_NEXT 1 +#define VIRTQ_DESC_F_WRITE 2 #define VIRTQ_DESC_F_INDIRECT 4 -#define VHOST_USER_REPLY_MASK (0x1 << 2) + +#define VIRTQ_DESC_F_AVAIL (1 << 7) +#define VIRTQ_DESC_F_USED (1 << 15) + +#define 
VRING_EVENT_F_ENABLE 0x0 +#define VRING_EVENT_F_DISABLE 0x1 +#define VRING_EVENT_F_DESC 0x2 #define VHOST_USER_PROTOCOL_F_MQ 0 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 @@ -100,8 +107,11 @@ typedef enum _ (VHOST_F_LOG_ALL, 26) \ _ (VIRTIO_F_ANY_LAYOUT, 27) \ _ (VIRTIO_F_INDIRECT_DESC, 28) \ + _ (VIRTIO_F_EVENT_IDX, 29) \ _ (VHOST_USER_F_PROTOCOL_FEATURES, 30) \ - _ (VIRTIO_F_VERSION_1, 32) + _ (VIRTIO_F_VERSION_1, 32) \ + _ (VIRTIO_F_RING_PACKED, 34) \ + _ (VIRTIO_F_IN_ORDER, 35) typedef enum { @@ -130,12 +140,12 @@ int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 * sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 enable_gso); + u8 enable_gso, u8 enable_packed); int vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 sw_if_index, u64 feature_mask, u8 renumber, u32 custom_dev_instance, - u8 enable_gso); + u8 enable_gso, u8 enable_packed); int vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index); @@ -223,6 +233,22 @@ typedef struct } ring[VHOST_VRING_MAX_SIZE]; } __attribute ((packed)) vring_used_t; +typedef CLIB_PACKED (struct +{ + u64 addr; // packet data buffer address + u32 len; // packet data buffer size + u16 id; // buffer id + u16 flags; // flags +}) vring_packed_desc_t; + +STATIC_ASSERT_SIZEOF (vring_packed_desc_t, 16); + +typedef CLIB_PACKED (struct +{ + u16 off_wrap; + u16 flags; +}) vring_desc_event_t; + typedef struct { u8 flags; @@ -260,9 +286,21 @@ typedef struct u16 last_avail_idx; u16 last_used_idx; u16 n_since_last_int; - vring_desc_t *desc; - vring_avail_t *avail; - vring_used_t *used; + union + { + vring_desc_t *desc; + vring_packed_desc_t *packed_desc; + }; + union + { + vring_avail_t *avail; + vring_desc_event_t *avail_event; + }; + union + { + vring_used_t *used; + vring_desc_event_t *used_event; + }; uword desc_user_addr; uword used_user_addr; uword avail_user_addr; @@ -287,6 
+325,9 @@ typedef struct * the interface even if it is disconnected and reconnected. */ i16 qid; + + u16 used_wrap_counter; + u16 avail_wrap_counter; } vhost_user_vring_t; #define VHOST_USER_EVENT_START_TIMER 1 @@ -332,6 +373,10 @@ typedef struct u16 *per_cpu_tx_qid; u8 enable_gso; + + /* Packed ring configured */ + u8 enable_packed; + } vhost_user_intf_t; typedef struct @@ -350,7 +395,6 @@ typedef struct virtio_net_hdr_mrg_rxbuf_t hdr; /** Virtio header **/ } vhost_trace_t; - #define VHOST_USER_RX_BUFFERS_N (2 * VLIB_FRAME_SIZE + 2) #define VHOST_USER_COPY_ARRAY_N (4 * VLIB_FRAME_SIZE) @@ -365,6 +409,9 @@ typedef struct /* This is here so it doesn't end-up * using stack or registers. */ vhost_trace_t *current_trace; + + u32 *to_next_list; + vlib_buffer_t **rx_buffers_pdesc; } vhost_cpu_t; typedef struct diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c index 2ab87a65690..67365334d95 100644 --- a/src/vnet/devices/virtio/vhost_user_api.c +++ b/src/vnet/devices/virtio/vhost_user_api.c @@ -71,10 +71,12 @@ vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp) disabled_features |= (1ULL << FEAT_VIRTIO_F_INDIRECT_DESC); /* - * feature mask is not supported via binary API. We disable GSO feature in the - * feature mask. It may be enabled via enable_gso argument. + * GSO and PACKED are not supported by feature mask via binary API. We + * disable GSO and PACKED feature in the feature mask. 
They may be enabled + * explicitly via enable_gso and enable_packed argument */ - disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS | + (1ULL << FEAT_VIRTIO_F_RING_PACKED); features &= ~disabled_features; if (mp->use_custom_mac) @@ -86,7 +88,7 @@ vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp) rv = vhost_user_create_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, &sw_if_index, features, mp->renumber, ntohl (mp->custom_dev_instance), - mac_p, mp->enable_gso); + mac_p, mp->enable_gso, mp->enable_packed); /* Remember an interface tag for the new interface */ if (rv == 0) @@ -122,16 +124,18 @@ vl_api_modify_vhost_user_if_t_handler (vl_api_modify_vhost_user_if_t * mp) vlib_main_t *vm = vlib_get_main (); /* - * feature mask is not supported via binary API. We disable GSO feature in the - * feature mask. It may be enabled via enable_gso argument. + * GSO and PACKED are not supported by feature mask via binary API. We + * disable GSO and PACKED feature in the feature mask. 
They may be enabled + * explicitly via enable_gso and enable_packed argument */ - disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + disabled_features |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS | + (1ULL << FEAT_VIRTIO_F_RING_PACKED); features &= ~disabled_features; rv = vhost_user_modify_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, sw_if_index, features, mp->renumber, ntohl (mp->custom_dev_instance), - mp->enable_gso); + mp->enable_gso, mp->enable_packed); REPLY_MACRO (VL_API_MODIFY_VHOST_USER_IF_REPLY); } diff --git a/src/vnet/devices/virtio/vhost_user_inline.h b/src/vnet/devices/virtio/vhost_user_inline.h index e4a1d596040..ceaf78cf799 100644 --- a/src/vnet/devices/virtio/vhost_user_inline.h +++ b/src/vnet/devices/virtio/vhost_user_inline.h @@ -292,6 +292,85 @@ vhost_user_update_gso_interface_count (vhost_user_intf_t * vui, u8 add) } } } + +static_always_inline u8 +vhost_user_packed_desc_available (vhost_user_vring_t * vring, u16 idx) +{ + return (((vring->packed_desc[idx].flags & VIRTQ_DESC_F_AVAIL) == + vring->avail_wrap_counter)); +} + +static_always_inline void +vhost_user_advance_last_avail_idx (vhost_user_vring_t * vring) +{ + vring->last_avail_idx++; + if (PREDICT_FALSE ((vring->last_avail_idx & vring->qsz_mask) == 0)) + vring->avail_wrap_counter ^= VIRTQ_DESC_F_AVAIL; +} + +static_always_inline void +vhost_user_advance_last_avail_table_idx (vhost_user_intf_t * vui, + vhost_user_vring_t * vring, + u8 chained) +{ + if (chained) + { + vring_packed_desc_t *desc_table = vring->packed_desc; + + /* pick up the slot of the next avail idx */ + while (desc_table[vring->last_avail_idx & vring->qsz_mask].flags & + VIRTQ_DESC_F_NEXT) + vhost_user_advance_last_avail_idx (vring); + } + + vhost_user_advance_last_avail_idx (vring); +} + +static_always_inline void +vhost_user_undo_advanced_last_avail_idx (vhost_user_vring_t * vring) +{ + if (PREDICT_FALSE ((vring->last_avail_idx & vring->qsz_mask) == 0)) + 
vring->avail_wrap_counter ^= VIRTQ_DESC_F_AVAIL; + vring->last_avail_idx--; +} + +static_always_inline void +vhost_user_dequeue_descs (vhost_user_vring_t * rxvq, + virtio_net_hdr_mrg_rxbuf_t * hdr, + u16 * n_descs_processed) +{ + u16 i; + + *n_descs_processed -= (hdr->num_buffers - 1); + for (i = 0; i < hdr->num_buffers - 1; i++) + vhost_user_undo_advanced_last_avail_idx (rxvq); +} + +static_always_inline void +vhost_user_dequeue_chained_descs (vhost_user_vring_t * rxvq, + u16 * n_descs_processed) +{ + while (*n_descs_processed) + { + vhost_user_undo_advanced_last_avail_idx (rxvq); + (*n_descs_processed)--; + } +} + +static_always_inline void +vhost_user_advance_last_used_idx (vhost_user_vring_t * vring) +{ + vring->last_used_idx++; + if (PREDICT_FALSE ((vring->last_used_idx & vring->qsz_mask) == 0)) + vring->used_wrap_counter ^= 1; +} + +static_always_inline u64 +vhost_user_is_packed_ring_supported (vhost_user_intf_t * vui) +{ + return (vui->features & (1ULL << FEAT_VIRTIO_F_RING_PACKED)); +} + #endif /* diff --git a/src/vnet/devices/virtio/vhost_user_input.c b/src/vnet/devices/virtio/vhost_user_input.c index 4b52bd5a54b..dd899094225 100644 --- a/src/vnet/devices/virtio/vhost_user_input.c +++ b/src/vnet/devices/virtio/vhost_user_input.c @@ -74,6 +74,7 @@ extern vlib_node_registration_t vhost_user_input_node; _(MMAP_FAIL, "mmap failure") \ _(INDIRECT_OVERFLOW, "indirect descriptor overflows table") \ _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)") \ + _(NOT_READY, "vhost interface not ready or down") \ _(FULL_RX_QUEUE, "full rx queue (possible driver tx drop)") typedef enum @@ -249,64 +250,59 @@ vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data, virtio_net_hdr_t * hdr) { u8 l4_hdr_sz = 0; + u8 l4_proto = 0; + ethernet_header_t *eh = (ethernet_header_t *) b0_data; + u16 ethertype = clib_net_to_host_u16 (eh->type); + u16 l2hdr_sz = sizeof (ethernet_header_t); - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + if 
(ethernet_frame_is_tagged (ethertype)) { - u8 l4_proto = 0; - ethernet_header_t *eh = (ethernet_header_t *) b0_data; - u16 ethertype = clib_net_to_host_u16 (eh->type); - u16 l2hdr_sz = sizeof (ethernet_header_t); + ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1); - if (ethernet_frame_is_tagged (ethertype)) + ethertype = clib_net_to_host_u16 (vlan->type); + l2hdr_sz += sizeof (*vlan); + if (ethertype == ETHERNET_TYPE_VLAN) { - ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1); - + vlan++; ethertype = clib_net_to_host_u16 (vlan->type); l2hdr_sz += sizeof (*vlan); - if (ethertype == ETHERNET_TYPE_VLAN) - { - vlan++; - ethertype = clib_net_to_host_u16 (vlan->type); - l2hdr_sz += sizeof (*vlan); - } - } - vnet_buffer (b0)->l2_hdr_offset = 0; - vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz; - vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start; - b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID | - VNET_BUFFER_F_L3_HDR_OFFSET_VALID | - VNET_BUFFER_F_L4_HDR_OFFSET_VALID); - - if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4)) - { - ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz); - l4_proto = ip4->protocol; - b0->flags |= (VNET_BUFFER_F_IS_IP4 | - VNET_BUFFER_F_OFFLOAD_IP_CKSUM); - } - else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6)) - { - ip6_header_t *ip6 = (ip6_header_t *) (b0_data + l2hdr_sz); - l4_proto = ip6->protocol; - b0->flags |= VNET_BUFFER_F_IS_IP6; } + } + vnet_buffer (b0)->l2_hdr_offset = 0; + vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz; + vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start; + b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID | + VNET_BUFFER_F_L3_HDR_OFFSET_VALID | + VNET_BUFFER_F_L4_HDR_OFFSET_VALID); + + if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4)) + { + ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip4->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM; + } + else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6)) + { + ip6_header_t *ip6 
= (ip6_header_t *) (b0_data + l2hdr_sz); + l4_proto = ip6->protocol; + b0->flags |= VNET_BUFFER_F_IS_IP6; + } - if (l4_proto == IP_PROTOCOL_TCP) - { - tcp_header_t *tcp = (tcp_header_t *) - (b0_data + vnet_buffer (b0)->l4_hdr_offset); - l4_hdr_sz = tcp_header_bytes (tcp); - tcp->checksum = 0; - b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - } - else if (l4_proto == IP_PROTOCOL_UDP) - { - udp_header_t *udp = - (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset); - l4_hdr_sz = sizeof (*udp); - udp->checksum = 0; - b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; - } + if (l4_proto == IP_PROTOCOL_TCP) + { + tcp_header_t *tcp = (tcp_header_t *) + (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = tcp_header_bytes (tcp); + tcp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + } + else if (l4_proto == IP_PROTOCOL_UDP) + { + udp_header_t *udp = + (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset); + l4_hdr_sz = sizeof (*udp); + udp->checksum = 0; + b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; } if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP) @@ -329,6 +325,57 @@ vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data, } } +static_always_inline void +vhost_user_input_do_interrupt (vlib_main_t * vm, vhost_user_vring_t * txvq, + vhost_user_vring_t * rxvq) +{ + f64 now = vlib_time_now (vm); + + if ((txvq->n_since_last_int) && (txvq->int_deadline < now)) + vhost_user_send_call (vm, txvq); + + if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now)) + vhost_user_send_call (vm, rxvq); +} + +static_always_inline void +vhost_user_input_setup_frame (vlib_main_t * vm, vlib_node_runtime_t * node, + vhost_user_intf_t * vui, + u32 * current_config_index, u32 * next_index, + u32 ** to_next, u32 * n_left_to_next) +{ + vnet_feature_main_t *fm = &feature_main; + u8 feature_arc_idx = fm->device_input_feature_arc_index; + + if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index))) + { + vnet_feature_config_main_t *cm; + cm = 
&fm->feature_config_mains[feature_arc_idx]; + *current_config_index = vec_elt (cm->config_index_by_sw_if_index, + vui->sw_if_index); + vnet_get_config_data (&cm->config_main, current_config_index, + next_index, 0); + } + + vlib_get_new_next_frame (vm, node, *next_index, *to_next, *n_left_to_next); + + if (*next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT) + { + /* give some hints to ethernet-input */ + vlib_next_frame_t *nf; + vlib_frame_t *f; + ethernet_input_frame_t *ef; + nf = vlib_node_runtime_get_next_frame (vm, node, *next_index); + f = vlib_get_frame (vm, nf->frame); + f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; + + ef = vlib_frame_scalar_args (f); + ef->sw_if_index = vui->sw_if_index; + ef->hw_if_index = vui->hw_if_index; + vlib_frame_no_append (f); + } +} + static_always_inline u32 vhost_user_if_input (vlib_main_t * vm, vhost_user_main_t * vum, @@ -359,13 +406,7 @@ vhost_user_if_input (vlib_main_t * vm, { /* do we have pending interrupts ? */ vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)]; - f64 now = vlib_time_now (vm); - - if ((txvq->n_since_last_int) && (txvq->int_deadline < now)) - vhost_user_send_call (vm, txvq); - - if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now)) - vhost_user_send_call (vm, rxvq); + vhost_user_input_do_interrupt (vm, txvq, rxvq); } /* @@ -461,37 +502,12 @@ vhost_user_if_input (vlib_main_t * vm, } } - if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index))) - { - vnet_feature_config_main_t *cm; - cm = &fm->feature_config_mains[feature_arc_idx]; - current_config_index = vec_elt (cm->config_index_by_sw_if_index, - vui->sw_if_index); - vnet_get_config_data (&cm->config_main, ¤t_config_index, - &next_index, 0); - } + vhost_user_input_setup_frame (vm, node, vui, ¤t_config_index, + &next_index, &to_next, &n_left_to_next); u16 last_avail_idx = txvq->last_avail_idx; u16 last_used_idx = txvq->last_used_idx; - vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next); - - if 
(next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT) - { - /* give some hints to ethernet-input */ - vlib_next_frame_t *nf; - vlib_frame_t *f; - ethernet_input_frame_t *ef; - nf = vlib_node_runtime_get_next_frame (vm, node, next_index); - f = vlib_get_frame (vm, nf->frame); - f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX; - - ef = vlib_frame_scalar_args (f); - ef->sw_if_index = vui->sw_if_index; - ef->hw_if_index = vui->hw_if_index; - vlib_frame_no_append (f); - } - while (n_left > 0) { vlib_buffer_t *b_head, *b_current; @@ -747,6 +763,654 @@ done: return n_rx_packets; } +static_always_inline void +vhost_user_mark_desc_consumed (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, u16 desc_head, + u16 n_descs_processed) +{ + vring_packed_desc_t *desc_table = txvq->packed_desc; + u16 desc_idx; + u16 mask = txvq->qsz_mask; + + for (desc_idx = 0; desc_idx < n_descs_processed; desc_idx++) + { + if (txvq->used_wrap_counter) + desc_table[(desc_head + desc_idx) & mask].flags |= + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + desc_table[(desc_head + desc_idx) & mask].flags &= + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + vhost_user_advance_last_used_idx (txvq); + } +} + +static_always_inline void +vhost_user_rx_trace_packed (vhost_trace_t * t, vhost_user_intf_t * vui, + u16 qid, vhost_user_vring_t * txvq, + u16 desc_current) +{ + vhost_user_main_t *vum = &vhost_user_main; + vring_packed_desc_t *hdr_desc; + virtio_net_hdr_mrg_rxbuf_t *hdr; + u32 hint = 0; + + clib_memset (t, 0, sizeof (*t)); + t->device_index = vui - vum->vhost_user_interfaces; + t->qid = qid; + + hdr_desc = &txvq->packed_desc[desc_current]; + if (txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT; + /* Header is the first here */ + hdr_desc = map_guest_mem (vui, txvq->packed_desc[desc_current].addr, + &hint); + } + if (txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) + t->virtio_ring_flags |= 1 << 
VIRTIO_TRACE_F_SIMPLE_CHAINED; + + if (!(txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) && + !(txvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)) + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC; + + t->first_desc_len = hdr_desc ? hdr_desc->len : 0; + + if (!hdr_desc || !(hdr = map_guest_mem (vui, hdr_desc->addr, &hint))) + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_MAP_ERROR; + else + { + u32 len = vui->virtio_net_hdr_sz; + clib_memcpy_fast (&t->hdr, hdr, + len > hdr_desc->len ? hdr_desc->len : len); + } +} + +static_always_inline u32 +vhost_user_rx_discard_packet_packed (vlib_main_t * vm, + vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 discard_max) +{ + u32 discarded_packets = 0; + u16 mask = txvq->qsz_mask; + u16 desc_current, desc_head; + + desc_head = desc_current = txvq->last_used_idx & mask; + + /* + * On the RX side, each packet corresponds to one descriptor + * (it is the same whether it is a shallow descriptor, chained, or indirect). + * Therefore, discarding a packet is like discarding a descriptor. 
+ */ + while ((discarded_packets != discard_max) && + vhost_user_packed_desc_available (txvq, desc_current)) + { + vhost_user_advance_last_avail_idx (txvq); + discarded_packets++; + desc_current = (desc_current + 1) & mask; + } + + if (PREDICT_TRUE (discarded_packets)) + vhost_user_mark_desc_consumed (vui, txvq, desc_head, discarded_packets); + return (discarded_packets); +} + +static_always_inline u32 +vhost_user_input_copy_packed (vhost_user_intf_t * vui, vhost_copy_t * cpy, + u16 copy_len, u32 * map_hint) +{ + void *src0, *src1, *src2, *src3, *src4, *src5, *src6, *src7; + u8 bad; + u32 rc = VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR; + + if (PREDICT_TRUE (copy_len >= 8)) + { + src4 = map_guest_mem (vui, cpy[0].src, map_hint); + src5 = map_guest_mem (vui, cpy[1].src, map_hint); + src6 = map_guest_mem (vui, cpy[2].src, map_hint); + src7 = map_guest_mem (vui, cpy[3].src, map_hint); + bad = (src4 == 0) + (src5 == 0) + (src6 == 0) + (src7 == 0); + if (PREDICT_FALSE (bad)) + goto one_by_one; + CLIB_PREFETCH (src4, 64, LOAD); + CLIB_PREFETCH (src5, 64, LOAD); + CLIB_PREFETCH (src6, 64, LOAD); + CLIB_PREFETCH (src7, 64, LOAD); + + while (PREDICT_TRUE (copy_len >= 8)) + { + src0 = src4; + src1 = src5; + src2 = src6; + src3 = src7; + + src4 = map_guest_mem (vui, cpy[4].src, map_hint); + src5 = map_guest_mem (vui, cpy[5].src, map_hint); + src6 = map_guest_mem (vui, cpy[6].src, map_hint); + src7 = map_guest_mem (vui, cpy[7].src, map_hint); + bad = (src4 == 0) + (src5 == 0) + (src6 == 0) + (src7 == 0); + if (PREDICT_FALSE (bad)) + break; + + CLIB_PREFETCH (src4, 64, LOAD); + CLIB_PREFETCH (src5, 64, LOAD); + CLIB_PREFETCH (src6, 64, LOAD); + CLIB_PREFETCH (src7, 64, LOAD); + + clib_memcpy_fast ((void *) cpy[0].dst, src0, cpy[0].len); + clib_memcpy_fast ((void *) cpy[1].dst, src1, cpy[1].len); + clib_memcpy_fast ((void *) cpy[2].dst, src2, cpy[2].len); + clib_memcpy_fast ((void *) cpy[3].dst, src3, cpy[3].len); + copy_len -= 4; + cpy += 4; + } + } + +one_by_one: + while (copy_len) 
+ { + if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint)))) + { + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + break; + } + clib_memcpy_fast ((void *) cpy->dst, src0, cpy->len); + copy_len -= 1; + cpy += 1; + } + return rc; +} + +static_always_inline u32 +vhost_user_do_offload (vhost_user_intf_t * vui, + vring_packed_desc_t * desc_table, u16 desc_current, + u16 mask, vlib_buffer_t * b_head, u32 * map_hint) +{ + u32 rc = VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR; + virtio_net_hdr_mrg_rxbuf_t *hdr; + u8 *b_data; + u32 desc_data_offset = vui->virtio_net_hdr_sz; + + hdr = map_guest_mem (vui, desc_table[desc_current].addr, map_hint); + if (PREDICT_FALSE (hdr == 0)) + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + { + if (desc_data_offset == desc_table[desc_current].len) + { + desc_current = (desc_current + 1) & mask; + b_data = + map_guest_mem (vui, desc_table[desc_current].addr, map_hint); + if (PREDICT_FALSE (b_data == 0)) + rc = VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL; + else + vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr); + } + else + { + b_data = (u8 *) hdr + desc_data_offset; + vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr); + } + } + + return rc; +} + +static_always_inline u32 +vhost_user_compute_buffers_required (u32 desc_len, u32 buffer_data_size) +{ + div_t result; + u32 buffers_required; + + if (PREDICT_TRUE (buffer_data_size == 2048)) + { + buffers_required = desc_len >> 11; + if ((desc_len & 2047) != 0) + buffers_required++; + return (buffers_required); + } + + if (desc_len < buffer_data_size) + return 1; + + result = div (desc_len, buffer_data_size); + if (result.rem) + buffers_required = result.quot + 1; + else + buffers_required = result.quot; + + return (buffers_required); +} + +static_always_inline u32 +vhost_user_compute_indirect_desc_len (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 buffer_data_size, u16 desc_current, + u32 * map_hint) +{ + 
vring_packed_desc_t *desc_table = txvq->packed_desc; + u32 desc_len = 0; + u16 desc_data_offset = vui->virtio_net_hdr_sz; + u16 desc_idx = desc_current; + u32 n_descs; + + n_descs = desc_table[desc_idx].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_idx].addr, map_hint); + if (PREDICT_FALSE (desc_table == 0)) + return 0; + + for (desc_idx = 0; desc_idx < n_descs; desc_idx++) + desc_len += desc_table[desc_idx].len; + + if (PREDICT_TRUE (desc_len > desc_data_offset)) + desc_len -= desc_data_offset; + + return vhost_user_compute_buffers_required (desc_len, buffer_data_size); +} + +static_always_inline u32 +vhost_user_compute_chained_desc_len (vhost_user_intf_t * vui, + vhost_user_vring_t * txvq, + u32 buffer_data_size, u16 * current, + u16 * n_left) +{ + vring_packed_desc_t *desc_table = txvq->packed_desc; + u32 desc_len = 0; + u16 mask = txvq->qsz_mask; + + while (desc_table[*current].flags & VIRTQ_DESC_F_NEXT) + { + desc_len += desc_table[*current].len; + (*n_left)++; + *current = (*current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + } + desc_len += desc_table[*current].len; + (*n_left)++; + *current = (*current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + + if (PREDICT_TRUE (desc_len > vui->virtio_net_hdr_sz)) + desc_len -= vui->virtio_net_hdr_sz; + + return vhost_user_compute_buffers_required (desc_len, buffer_data_size); +} + +static_always_inline void +vhost_user_assemble_packet (vring_packed_desc_t * desc_table, + u16 * desc_idx, vlib_buffer_t * b_head, + vlib_buffer_t ** b_current, u32 ** next, + vlib_buffer_t *** b, u32 * bi_current, + vhost_cpu_t * cpu, u16 * copy_len, + u32 * buffers_used, u32 buffers_required, + u32 * desc_data_offset, u32 buffer_data_size, + u16 mask) +{ + u32 desc_data_l; + + while (*desc_data_offset < desc_table[*desc_idx].len) + { + /* Get more output if necessary. Or end of packet. 
*/ + if (PREDICT_FALSE ((*b_current)->current_length == buffer_data_size)) + { + /* Get next output */ + u32 bi_next = **next; + (*next)++; + (*b_current)->next_buffer = bi_next; + (*b_current)->flags |= VLIB_BUFFER_NEXT_PRESENT; + *bi_current = bi_next; + *b_current = **b; + (*b)++; + (*buffers_used)++; + ASSERT (*buffers_used <= buffers_required); + } + + /* Prepare a copy order executed later for the data */ + ASSERT (*copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[*copy_len]; + (*copy_len)++; + desc_data_l = desc_table[*desc_idx].len - *desc_data_offset; + cpy->len = buffer_data_size - (*b_current)->current_length; + cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len; + cpy->dst = (uword) (vlib_buffer_get_current (*b_current) + + (*b_current)->current_length); + cpy->src = desc_table[*desc_idx].addr + *desc_data_offset; + + *desc_data_offset += cpy->len; + + (*b_current)->current_length += cpy->len; + b_head->total_length_not_including_first_buffer += cpy->len; + } + *desc_idx = (*desc_idx + 1) & mask;; + *desc_data_offset = 0; +} + +static_always_inline u32 +vhost_user_if_input_packed (vlib_main_t * vm, vhost_user_main_t * vum, + vhost_user_intf_t * vui, u16 qid, + vlib_node_runtime_t * node, + vnet_hw_interface_rx_mode mode, u8 enable_csum) +{ + vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + vnet_feature_main_t *fm = &feature_main; + u8 feature_arc_idx = fm->device_input_feature_arc_index; + u16 n_rx_packets = 0; + u32 n_rx_bytes = 0; + u16 n_left = 0; + u32 buffers_required = 0; + u32 n_left_to_next, *to_next; + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + u32 n_trace = vlib_get_trace_count (vm, node); + u32 buffer_data_size = vlib_buffer_get_default_data_size (vm); + u32 map_hint = 0; + vhost_cpu_t *cpu = &vum->cpus[vm->thread_index]; + u16 copy_len = 0; + u32 current_config_index = ~0; + u16 mask = txvq->qsz_mask; + u16 desc_current, desc_head, last_used_idx; + vring_packed_desc_t 
*desc_table = 0; + u32 n_descs_processed = 0; + u32 rv; + vlib_buffer_t **b; + u32 *next; + u32 buffers_used = 0; + u16 current, n_descs_to_process; + + /* The descriptor table is not ready yet */ + if (PREDICT_FALSE (txvq->packed_desc == 0)) + goto done; + + /* do we have pending interrupts ? */ + vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)]; + vhost_user_input_do_interrupt (vm, txvq, rxvq); + + /* + * For adaptive mode, it is optimized to reduce interrupts. + * If the scheduler switches the input node to polling due + * to burst of traffic, we tell the driver no interrupt. + * When the traffic subsides, the scheduler switches the node back to + * interrupt mode. We must tell the driver we want interrupt. + */ + if (PREDICT_FALSE (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) + { + if ((node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) || + !(node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + /* Tell driver we want notification */ + txvq->used_event->flags = 0; + else + /* Tell driver we don't want notification */ + txvq->used_event->flags = VRING_EVENT_F_DISABLE; + } + + last_used_idx = txvq->last_used_idx & mask; + desc_head = desc_current = last_used_idx; + + if (vhost_user_packed_desc_available (txvq, desc_current) == 0) + goto done; + + if (PREDICT_FALSE (!vui->admin_up || !vui->is_ready || !(txvq->enabled))) + { + /* + * Discard input packet if interface is admin down or vring is not + * enabled. + * "For example, for a networking device, in the disabled state + * client must not supply any new RX packets, but must process + * and discard any TX packets." 
+ */ + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, + VHOST_USER_DOWN_DISCARD_COUNT); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NOT_READY, rv); + goto done; + } + + vhost_user_input_setup_frame (vm, node, vui, &current_config_index, + &next_index, &to_next, &n_left_to_next); + + /* + * Compute n_left and total buffers needed + */ + desc_table = txvq->packed_desc; + current = desc_current; + while (vhost_user_packed_desc_available (txvq, current) && + (n_left < VLIB_FRAME_SIZE)) + { + if (desc_table[current].flags & VIRTQ_DESC_F_INDIRECT) + { + buffers_required += + vhost_user_compute_indirect_desc_len (vui, txvq, buffer_data_size, + current, &map_hint); + n_left++; + current = (current + 1) & mask; + vhost_user_advance_last_avail_idx (txvq); + } + else + { + buffers_required += + vhost_user_compute_chained_desc_len (vui, txvq, buffer_data_size, + &current, &n_left); + } + } + + /* Something is broken if we need more than 10000 buffers */ + if (PREDICT_FALSE ((buffers_required == 0) || (buffers_required > 10000))) + { + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, n_left); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, rv); + goto done; + } + + vec_validate (cpu->to_next_list, buffers_required); + rv = vlib_buffer_alloc (vm, cpu->to_next_list, buffers_required); + if (PREDICT_FALSE (rv != buffers_required)) + { + vlib_buffer_free (vm, cpu->to_next_list, rv); + rv = vhost_user_rx_discard_packet_packed (vm, vui, txvq, n_left); + vlib_error_count (vm, vhost_user_input_node.index, + VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, rv); + goto done; + } + + next = cpu->to_next_list; + vec_validate (cpu->rx_buffers_pdesc, buffers_required); + vlib_get_buffers (vm, next, cpu->rx_buffers_pdesc, buffers_required); + b = cpu->rx_buffers_pdesc; + n_descs_processed = n_left; + + while (n_left) + { + vlib_buffer_t *b_head, *b_current; + u32 bi_current; + u32 desc_data_offset; + u16 desc_idx 
= desc_current; + u32 n_descs; + + desc_table = txvq->packed_desc; + to_next[0] = bi_current = next[0]; + b_head = b_current = b[0]; + b++; + buffers_used++; + ASSERT (buffers_used <= buffers_required); + to_next++; + next++; + n_left_to_next--; + + /* The buffer should already be initialized */ + b_head->total_length_not_including_first_buffer = 0; + b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + desc_data_offset = vui->virtio_net_hdr_sz; + n_descs_to_process = 1; + + if (desc_table[desc_idx].flags & VIRTQ_DESC_F_INDIRECT) + { + n_descs = desc_table[desc_idx].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_idx].addr, + &map_hint); + desc_idx = 0; + if (PREDICT_FALSE (desc_table == 0) || + (enable_csum && + (PREDICT_FALSE + (vhost_user_do_offload + (vui, desc_table, desc_idx, mask, b_head, + &map_hint) != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)))) + { + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); + to_next--; + next--; + n_left_to_next++; + buffers_used--; + b--; + goto out; + } + while (n_descs) + { + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + n_descs--; + } + } + else + { + if (enable_csum) + { + rv = vhost_user_do_offload (vui, desc_table, desc_idx, mask, + b_head, &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + { + vlib_error_count (vm, node->node_index, rv, 1); + to_next--; + next--; + n_left_to_next++; + buffers_used--; + b--; + goto out; + } + } + /* + * For chained descriptor, we process all chains in a single while + * loop. So count how many descriptors in the chain. 
+ */ + n_descs_to_process = 1; + while (desc_table[desc_idx].flags & VIRTQ_DESC_F_NEXT) + { + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + n_descs_to_process++; + } + vhost_user_assemble_packet (desc_table, &desc_idx, b_head, + &b_current, &next, &b, &bi_current, + cpu, &copy_len, &buffers_used, + buffers_required, &desc_data_offset, + buffer_data_size, mask); + } + + n_rx_bytes += b_head->total_length_not_including_first_buffer; + n_rx_packets++; + + b_head->total_length_not_including_first_buffer -= + b_head->current_length; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head); + + vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index; + vnet_buffer (b_head)->sw_if_index[VLIB_TX] = ~0; + b_head->error = 0; + + if (current_config_index != ~0) + { + b_head->current_config_index = current_config_index; + vnet_buffer (b_head)->feature_arc_index = feature_arc_idx; + } + + out: + ASSERT (n_left >= n_descs_to_process); + n_left -= n_descs_to_process; + + /* advance to next descriptor */ + desc_current = (desc_current + n_descs_to_process) & mask; + + /* + * Although separating memory copies from virtio ring parsing + * is beneficial, we can offer to perform the copies from time + * to time in order to free some space in the ring. 
+ */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) + { + rv = vhost_user_input_copy_packed (vui, cpu->copy, copy_len, + &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + vlib_error_count (vm, node->node_index, rv, 1); + copy_len = 0; + } + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + /* Do the memory copies */ + rv = vhost_user_input_copy_packed (vui, cpu->copy, copy_len, &map_hint); + if (PREDICT_FALSE (rv != VHOST_USER_INPUT_FUNC_ERROR_NO_ERROR)) + vlib_error_count (vm, node->node_index, rv, 1); + + /* Must do the tracing before giving buffers back to driver */ + if (PREDICT_FALSE (n_trace)) + { + u32 left = n_rx_packets; + + b = cpu->rx_buffers_pdesc; + while (n_trace && left) + { + vhost_trace_t *t0; + + vlib_trace_buffer (vm, node, next_index, b[0], + /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b[0], sizeof (t0[0])); + b++; + vhost_user_rx_trace_packed (t0, vui, qid, txvq, last_used_idx); + last_used_idx = (last_used_idx + 1) & mask; + n_trace--; + left--; + vlib_set_trace_count (vm, node, n_trace); + } + } + + /* + * Give buffers back to driver. 
+ */ + vhost_user_mark_desc_consumed (vui, txvq, desc_head, n_descs_processed); + + /* interrupt (call) handling */ + if ((txvq->callfd_idx != ~0) && + (txvq->avail_event->flags != VRING_EVENT_F_DISABLE)) + { + txvq->n_since_last_int += n_rx_packets; + if (txvq->n_since_last_int > vum->coalesce_frames) + vhost_user_send_call (vm, txvq); + } + + /* increase rx counters */ + vlib_increment_combined_counter + (vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, vm->thread_index, vui->sw_if_index, + n_rx_packets, n_rx_bytes); + + vnet_device_increment_rx_packets (vm->thread_index, n_rx_packets); + + if (PREDICT_FALSE (buffers_used < buffers_required)) + vlib_buffer_free (vm, next, buffers_required - buffers_used); + +done: + return n_rx_packets; +} + VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -765,14 +1429,26 @@ VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm, { vui = pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance); - if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) - n_rx_packets += - vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, - 1); + if (vhost_user_is_packed_ring_supported (vui)) + { + if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) + n_rx_packets += vhost_user_if_input_packed (vm, vum, vui, + dq->queue_id, node, + dq->mode, 1); + else + n_rx_packets += vhost_user_if_input_packed (vm, vum, vui, + dq->queue_id, node, + dq->mode, 0); + } else - n_rx_packets += - vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode, - 0); + { + if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM)) + n_rx_packets += vhost_user_if_input (vm, vum, vui, dq->queue_id, + node, dq->mode, 1); + else + n_rx_packets += vhost_user_if_input (vm, vum, vui, dq->queue_id, + node, dq->mode, 0); + } } } diff --git a/src/vnet/devices/virtio/vhost_user_output.c b/src/vnet/devices/virtio/vhost_user_output.c index b6abe36d972..4f5eb3c1d76 100644 
--- a/src/vnet/devices/virtio/vhost_user_output.c +++ b/src/vnet/devices/virtio/vhost_user_output.c @@ -294,6 +294,424 @@ vhost_user_handle_tx_offload (vhost_user_intf_t * vui, vlib_buffer_t * b, } } +static_always_inline void +vhost_user_mark_desc_available (vlib_main_t * vm, vhost_user_vring_t * rxvq, + u16 * n_descs_processed, u8 chained, + vlib_frame_t * frame, u32 n_left) +{ + u16 desc_idx, flags; + vring_packed_desc_t *desc_table = rxvq->packed_desc; + u16 last_used_idx = rxvq->last_used_idx; + + if (PREDICT_FALSE (*n_descs_processed == 0)) + return; + + if (rxvq->used_wrap_counter) + flags = desc_table[last_used_idx & rxvq->qsz_mask].flags | + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + flags = desc_table[last_used_idx & rxvq->qsz_mask].flags & + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + + vhost_user_advance_last_used_idx (rxvq); + + for (desc_idx = 1; desc_idx < *n_descs_processed; desc_idx++) + { + if (rxvq->used_wrap_counter) + desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags |= + (VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + else + desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags &= + ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED); + vhost_user_advance_last_used_idx (rxvq); + } + + desc_table[last_used_idx & rxvq->qsz_mask].flags = flags; + + *n_descs_processed = 0; + + if (chained) + { + vring_packed_desc_t *desc_table = rxvq->packed_desc; + + while (desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags & + VIRTQ_DESC_F_NEXT) + vhost_user_advance_last_used_idx (rxvq); + + /* Advance past the current chained table entries */ + vhost_user_advance_last_used_idx (rxvq); + } + + /* interrupt (call) handling */ + if ((rxvq->callfd_idx != ~0) && + (rxvq->avail_event->flags != VRING_EVENT_F_DISABLE)) + { + vhost_user_main_t *vum = &vhost_user_main; + + rxvq->n_since_last_int += frame->n_vectors - n_left; + if (rxvq->n_since_last_int > vum->coalesce_frames) + vhost_user_send_call (vm, rxvq); + } +} + +static_always_inline void 
+vhost_user_tx_trace_packed (vhost_trace_t * t, vhost_user_intf_t * vui, + u16 qid, vlib_buffer_t * b, + vhost_user_vring_t * rxvq) +{ + vhost_user_main_t *vum = &vhost_user_main; + u32 last_avail_idx = rxvq->last_avail_idx; + u32 desc_current = last_avail_idx & rxvq->qsz_mask; + vring_packed_desc_t *hdr_desc = 0; + u32 hint = 0; + + clib_memset (t, 0, sizeof (*t)); + t->device_index = vui - vum->vhost_user_interfaces; + t->qid = qid; + + hdr_desc = &rxvq->packed_desc[desc_current]; + if (rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT; + /* Header is the first here */ + hdr_desc = map_guest_mem (vui, rxvq->packed_desc[desc_current].addr, + &hint); + } + if (rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED; + } + if (!(rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_NEXT) && + !(rxvq->packed_desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)) + { + t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC; + } + + t->first_desc_len = hdr_desc ? 
hdr_desc->len : 0; +} + +static_always_inline uword +vhost_user_device_class_packed (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 *buffers = vlib_frame_vector_args (frame); + u32 n_left = frame->n_vectors; + vhost_user_main_t *vum = &vhost_user_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + vhost_user_intf_t *vui = + pool_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance); + u32 qid; + vhost_user_vring_t *rxvq; + u8 error; + u32 thread_index = vm->thread_index; + vhost_cpu_t *cpu = &vum->cpus[thread_index]; + u32 map_hint = 0; + u8 retry = 8; + u16 copy_len; + u16 tx_headers_len; + vring_packed_desc_t *desc_table; + u32 or_flags; + u16 desc_head, desc_index, desc_len; + u16 n_descs_processed; + u8 indirect, chained; + + qid = VHOST_VRING_IDX_RX (*vec_elt_at_index (vui->per_cpu_tx_qid, + thread_index)); + rxvq = &vui->vrings[qid]; + +retry: + error = VHOST_USER_TX_FUNC_ERROR_NONE; + tx_headers_len = 0; + copy_len = 0; + n_descs_processed = 0; + + while (n_left > 0) + { + vlib_buffer_t *b0, *current_b0; + uword buffer_map_addr; + u32 buffer_len; + u16 bytes_left; + u32 total_desc_len = 0; + u16 n_entries = 0; + + indirect = 0; + chained = 0; + if (PREDICT_TRUE (n_left > 1)) + vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD); + + b0 = vlib_get_buffer (vm, buffers[0]); + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + cpu->current_trace = vlib_add_trace (vm, node, b0, + sizeof (*cpu->current_trace)); + vhost_user_tx_trace_packed (cpu->current_trace, vui, qid / 2, b0, + rxvq); + } + + desc_table = rxvq->packed_desc; + desc_head = desc_index = rxvq->last_avail_idx & rxvq->qsz_mask; + if (PREDICT_FALSE (!vhost_user_packed_desc_available (rxvq, desc_head))) + { + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + /* + * Go deeper in case of indirect descriptor. + * To test it, turn off mrg_rxbuf. 
+ */ + if (desc_table[desc_head].flags & VIRTQ_DESC_F_INDIRECT) + { + indirect = 1; + if (PREDICT_FALSE (desc_table[desc_head].len < + sizeof (vring_packed_desc_t))) + { + error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW; + goto done; + } + n_entries = desc_table[desc_head].len >> 4; + desc_table = map_guest_mem (vui, desc_table[desc_index].addr, + &map_hint); + if (PREDICT_FALSE (desc_table == 0)) + { + error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL; + goto done; + } + desc_index = 0; + } + else if (rxvq->packed_desc[desc_head].flags & VIRTQ_DESC_F_NEXT) + chained = 1; + + desc_len = vui->virtio_net_hdr_sz; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + + /* Get a header from the header array */ + virtio_net_hdr_mrg_rxbuf_t *hdr = &cpu->tx_headers[tx_headers_len]; + tx_headers_len++; + hdr->hdr.flags = 0; + hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; + hdr->num_buffers = 1; + + or_flags = (b0->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM) || + (b0->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM) || + (b0->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM); + + /* Guest supports csum offload and buffer requires checksum offload? */ + if (or_flags && + (vui->features & (1ULL << FEAT_VIRTIO_NET_F_GUEST_CSUM))) + vhost_user_handle_tx_offload (vui, b0, &hdr->hdr); + + /* Prepare a copy order executed later for the header */ + ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[copy_len]; + copy_len++; + cpy->len = vui->virtio_net_hdr_sz; + cpy->dst = buffer_map_addr; + cpy->src = (uword) hdr; + + buffer_map_addr += vui->virtio_net_hdr_sz; + buffer_len -= vui->virtio_net_hdr_sz; + bytes_left = b0->current_length; + current_b0 = b0; + while (1) + { + if (buffer_len == 0) + { + /* Get new output */ + if (chained) + { + /* + * Next one is chained + * Test it with both indirect and mrg_rxbuf off + */ + if (PREDICT_FALSE (!(desc_table[desc_index].flags & + VIRTQ_DESC_F_NEXT))) + { + /* + * Last descriptor in chain. 
+ * Dequeue queued descriptors for this packet + */ + vhost_user_dequeue_chained_descs (rxvq, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + vhost_user_advance_last_avail_idx (rxvq); + desc_index = rxvq->last_avail_idx & rxvq->qsz_mask; + n_descs_processed++; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + total_desc_len += desc_len; + desc_len = 0; + } + else if (indirect) + { + /* + * Indirect table + * Test it with mrg_rxbuf off + */ + if (PREDICT_TRUE (n_entries > 0)) + n_entries--; + else + { + /* Dequeue queued descriptors for this packet */ + vhost_user_dequeue_chained_descs (rxvq, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW; + goto done; + } + total_desc_len += desc_len; + desc_index = (desc_index + 1) & rxvq->qsz_mask; + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + desc_len = 0; + } + else if (vui->virtio_net_hdr_sz == 12) + { + /* + * MRG is available + * This is the default setting for the guest VM + */ + virtio_net_hdr_mrg_rxbuf_t *hdr = + &cpu->tx_headers[tx_headers_len - 1]; + + desc_table[desc_index].len = desc_len; + vhost_user_advance_last_avail_idx (rxvq); + desc_head = desc_index = + rxvq->last_avail_idx & rxvq->qsz_mask; + hdr->num_buffers++; + n_descs_processed++; + desc_len = 0; + + if (PREDICT_FALSE (!vhost_user_packed_desc_available + (rxvq, desc_index))) + { + /* Dequeue queued descriptors for this packet */ + vhost_user_dequeue_descs (rxvq, hdr, + &n_descs_processed); + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF; + goto done; + } + + buffer_map_addr = desc_table[desc_index].addr; + buffer_len = desc_table[desc_index].len; + } + else + { + error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG; + goto done; + } + } + + ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N); + vhost_copy_t *cpy = &cpu->copy[copy_len]; + copy_len++; + cpy->len = bytes_left; + cpy->len = (cpy->len > 
buffer_len) ? buffer_len : cpy->len; + cpy->dst = buffer_map_addr; + cpy->src = (uword) vlib_buffer_get_current (current_b0) + + current_b0->current_length - bytes_left; + + bytes_left -= cpy->len; + buffer_len -= cpy->len; + buffer_map_addr += cpy->len; + desc_len += cpy->len; + + CLIB_PREFETCH (&rxvq->packed_desc, CLIB_CACHE_LINE_BYTES, LOAD); + + /* Check if vlib buffer has more data. If not, get more or break */ + if (PREDICT_TRUE (!bytes_left)) + { + if (PREDICT_FALSE + (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + current_b0 = vlib_get_buffer (vm, current_b0->next_buffer); + bytes_left = current_b0->current_length; + } + else + { + /* End of packet */ + break; + } + } + } + + /* Move from available to used ring */ + total_desc_len += desc_len; + rxvq->packed_desc[desc_head].len = total_desc_len; + + vhost_user_advance_last_avail_table_idx (vui, rxvq, chained); + n_descs_processed++; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + cpu->current_trace->hdr = cpu->tx_headers[tx_headers_len - 1]; + + n_left--; + + /* + * Do the copy periodically to prevent + * cpu->copy array overflow and corrupt memory + */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_TX_COPY_THRESHOLD) || chained) + { + if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len, + &map_hint))) + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); + copy_len = 0; + + /* give buffers back to driver */ + vhost_user_mark_desc_available (vm, rxvq, &n_descs_processed, + chained, frame, n_left); + } + + buffers++; + } + +done: + if (PREDICT_TRUE (copy_len)) + { + if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len, + &map_hint))) + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); + + vhost_user_mark_desc_available (vm, rxvq, &n_descs_processed, chained, + frame, n_left); + } + + /* + * When n_left is set, error is always set to something too. 
+ * In case error is due to lack of remaining buffers, we go back up and + retry. + * The idea is that it is better to waste some time on packets + * that have been processed already than dropping them and get + * more fresh packets with a good likelihood that they will be dropped too. + * This technique also gives more time to VM driver to pick-up packets. + * In case the traffic flows from physical to virtual interfaces, this + * technique will end-up leveraging the physical NIC buffer in order to + * absorb the VM's CPU jitter. + */ + if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry) + { + retry--; + goto retry; + } + + vhost_user_vring_unlock (vui, qid); + + if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE)) + { + vlib_error_count (vm, node->node_index, error, n_left); + vlib_increment_simple_counter + (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_DROP, thread_index, vui->sw_if_index, n_left); + } + + vlib_buffer_free (vm, vlib_frame_vector_args (frame), frame->n_vectors); + return frame->n_vectors; +} + VNET_DEVICE_CLASS_TX_FN (vhost_user_device_class) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) @@ -339,6 +757,9 @@ VNET_DEVICE_CLASS_TX_FN (vhost_user_device_class) (vlib_main_t * vm, if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); + if (vhost_user_is_packed_ring_supported (vui)) + return (vhost_user_device_class_packed (vm, node, frame)); + retry: error = VHOST_USER_TX_FUNC_ERROR_NONE; tx_headers_len = 0; |