author	Christian Ehrhardt <christian.ehrhardt@canonical.com>	2017-05-16 14:51:32 +0200
committer	Christian Ehrhardt <christian.ehrhardt@canonical.com>	2017-05-16 16:20:45 +0200
commit	7595afa4d30097c1177b69257118d8ad89a539be (patch)
tree	4bfeadc905c977e45e54a90c42330553b8942e4e /lib/librte_vhost
parent	ce3d555e43e3795b5d9507fcfc76b7a0a92fd0d6 (diff)
Imported Upstream version 17.05
Change-Id: Id1e419c5a214e4a18739663b91f0f9a549f1fdc6
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Diffstat (limited to 'lib/librte_vhost')
-rw-r--r--	lib/librte_vhost/Makefile	11
-rw-r--r--	lib/librte_vhost/fd_man.c	33
-rw-r--r--	lib/librte_vhost/fd_man.h	2
-rw-r--r--	lib/librte_vhost/rte_vhost.h	439
-rw-r--r--	lib/librte_vhost/rte_vhost_version.map	23
-rw-r--r--	lib/librte_vhost/rte_virtio_net.h	193
-rw-r--r--	lib/librte_vhost/socket.c	277
-rw-r--r--	lib/librte_vhost/vhost.c	248
-rw-r--r--	lib/librte_vhost/vhost.h	136
-rw-r--r--	lib/librte_vhost/vhost_user.c	207
-rw-r--r--	lib/librte_vhost/vhost_user.h	14
-rw-r--r--	lib/librte_vhost/virtio_net.c	134
12 files changed, 1152 insertions(+), 565 deletions(-)
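
The headline change in this import is the vhost-user API rework: rte_virtio_net.h is removed in favour of the new rte_vhost.h, and the driver is configured per socket path instead of through process-global state. As a quick orientation before the patch body, here is a minimal sketch (not part of the patch) of how an application drives the new entry points. The callback bodies and the `start_vhost()` helper are illustrative assumptions; the `rte_vhost_*` calls and `struct vhost_device_ops` are taken from the header added below.

```c
#include <rte_vhost.h>

/* Hypothetical callbacks: a real backend would set up and tear down
 * its datapath for the given vid here. */
static int new_device(int vid) { (void)vid; return 0; }
static void destroy_device(int vid) { (void)vid; }

static const struct vhost_device_ops ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};

static int start_vhost(const char *path)
{
	/* Listen on (or, with RTE_VHOST_USER_CLIENT, connect to) the
	 * given unix socket path. */
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;

	/* Callbacks are now bound per path, not globally. */
	if (rte_vhost_driver_callback_register(path, &ops) < 0)
		return -1;

	/* Non-blocking replacement for rte_vhost_driver_session_start():
	 * the fdset dispatch loop runs in a thread created here. */
	return rte_vhost_driver_start(path);
}
```

Note that `rte_vhost_driver_start()` replaces the old blocking `rte_vhost_driver_session_start()`: as the socket.c hunk below shows, the fdset dispatch loop now runs in a pthread created on the first start.
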
diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 415ffc6e..4a116fe3 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -36,7 +36,7 @@ LIB = librte_vhost.a EXPORT_MAP := rte_vhost_version.map -LIBABIVER := 3 +LIBABIVER := 4 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 CFLAGS += -I vhost_user @@ -51,13 +51,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \ virtio_net.c # install includes -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h - -# dependencies -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mempool -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_net +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_vhost/fd_man.c b/lib/librte_vhost/fd_man.c index 8a075da2..2ceacc9a 100644 --- a/lib/librte_vhost/fd_man.c +++ b/lib/librte_vhost/fd_man.c @@ -65,17 +65,12 @@ fdset_move(struct fdset *pfdset, int dst, int src) pfdset->rwfds[dst] = pfdset->rwfds[src]; } -/* - * Find deleted fd entries and remove them - */ static void -fdset_shrink(struct fdset *pfdset) +fdset_shrink_nolock(struct fdset *pfdset) { int i; int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); - pthread_mutex_lock(&pfdset->fd_mutex); - for (i = 0; i < last_valid_idx; i++) { if (pfdset->fd[i].fd != -1) continue; @@ -84,7 +79,16 @@ fdset_shrink(struct fdset *pfdset) last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); } pfdset->num = last_valid_idx + 1; +} +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); pthread_mutex_unlock(&pfdset->fd_mutex); } @@ -151,8 +155,12 @@ fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) pthread_mutex_lock(&pfdset->fd_mutex); i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; if (i == -1) { - pthread_mutex_unlock(&pfdset->fd_mutex); - return -2; + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } } fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); @@ -202,8 +210,8 @@ fdset_del(struct fdset *pfdset, int fd) * will wait until the flag is reset to zero(which indicates the callback is * finished), then it could free the context after fdset_del. 
*/ -void -fdset_event_dispatch(struct fdset *pfdset) +void * +fdset_event_dispatch(void *arg) { int i; struct pollfd *pfd; @@ -213,9 +221,10 @@ fdset_event_dispatch(struct fdset *pfdset) int fd, numfds; int remove1, remove2; int need_shrink; + struct fdset *pfdset = arg; if (pfdset == NULL) - return; + return NULL; while (1) { @@ -286,4 +295,6 @@ fdset_event_dispatch(struct fdset *pfdset) if (need_shrink) fdset_shrink(pfdset); } + + return NULL; } diff --git a/lib/librte_vhost/fd_man.h b/lib/librte_vhost/fd_man.h index d319cac6..90d34db1 100644 --- a/lib/librte_vhost/fd_man.h +++ b/lib/librte_vhost/fd_man.h @@ -64,6 +64,6 @@ int fdset_add(struct fdset *pfdset, int fd, void *fdset_del(struct fdset *pfdset, int fd); -void fdset_event_dispatch(struct fdset *pfdset); +void *fdset_event_dispatch(void *arg); #endif diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h new file mode 100644 index 00000000..605e47cb --- /dev/null +++ b/lib/librte_vhost/rte_vhost.h @@ -0,0 +1,439 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include <stdint.h> +#include <sys/eventfd.h> + +#include <rte_memory.h> +#include <rte_mempool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* These are not C++-aware. */ +#include <linux/vhost.h> +#include <linux/virtio_ring.h> + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. 
+ */
+struct rte_vhost_memory {
+	uint32_t nregions;
+	struct rte_vhost_mem_region regions[];
+};
+
+struct rte_vhost_vring {
+	struct vring_desc *desc;
+	struct vring_avail *avail;
+	struct vring_used *used;
+	uint64_t log_guest_addr;
+
+	int callfd;
+	int kickfd;
+	uint16_t size;
+};
+
+/**
+ * Device and vring operations.
+ */
+struct vhost_device_ops {
+	int (*new_device)(int vid);		/**< Add device. */
+	void (*destroy_device)(int vid);	/**< Remove device. */
+
+	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< triggered when a vring is enabled or disabled */
+
+	/**
+	 * Features could be changed after the feature negotiation.
+	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
+	 * start/end of live migration, respectively. This callback
+	 * is used to inform the application of such change.
+	 */
+	int (*features_changed)(int vid, uint64_t features);
+
+	void *reserved[4];	/**< Reserved for future extension */
+};
+
+/**
+ * Convert guest physical address to host virtual address
+ *
+ * @param mem
+ *  the guest memory regions
+ * @param gpa
+ *  the guest physical address for querying
+ * @return
+ *  the host virtual address on success, 0 on failure
+ */
+static inline uint64_t __attribute__((always_inline))
+rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
+{
+	struct rte_vhost_mem_region *reg;
+	uint32_t i;
+
+	for (i = 0; i < mem->nregions; i++) {
+		reg = &mem->regions[i];
+		if (gpa >= reg->guest_phys_addr &&
+		    gpa <  reg->guest_phys_addr + reg->size) {
+			return gpa - reg->guest_phys_addr +
+			       reg->host_user_addr;
+		}
+	}
+
+	return 0;
+}
+
+#define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))
+
+/**
+ * Log the memory write start with given address.
+ *
+ * This function only needs to be invoked when live migration starts.
+ * Therefore, it need not be called at all most of the time. To keep
+ * the performance impact to a minimum, it's suggested to do a check
+ * before calling it:
+ *
+ *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *                rte_vhost_log_write(vid, addr, len);
+ *
+ * @param vid
+ *  vhost device ID
+ * @param addr
+ *  the starting address for write
+ * @param len
+ *  the length to write
+ */
+void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
+
+/**
+ * Log the used ring update start at given offset.
+ *
+ * Same as rte_vhost_log_write, it's suggested to do a check before
+ * calling it:
+ *
+ *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *                rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  the vring index
+ * @param offset
+ *  the offset inside the used ring
+ * @param len
+ *  the length to write
+ */
+void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+			      uint64_t offset, uint64_t len);
+
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
+
+/**
+ * Register vhost driver. path could be different for multiple
+ * instance support.
+ */
+int rte_vhost_driver_register(const char *path, uint64_t flags);
+
+/* Unregister vhost driver. This is only meaningful to vhost user. */
+int rte_vhost_driver_unregister(const char *path);
+
+/**
+ * Set the feature bits the vhost-user driver supports.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  Supported features
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_features(const char *path, uint64_t features);
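
The set/enable/disable trio introduced here replaces the old global `rte_vhost_feature_enable()`/`rte_vhost_feature_disable()`. A hedged sketch of trimming the builtin net driver's feature set for one socket, using `rte_vhost_driver_disable_features()` (declared just below); the feature bit macros come from `<linux/virtio_net.h>`, and `disable_tso()` is an illustrative helper, not part of the patch:

```c
#include <linux/virtio_net.h>
#include <rte_vhost.h>

/* Illustrative helper: must run after rte_vhost_driver_register() and
 * before the guest connects, i.e. before negotiation starts. */
static int disable_tso(const char *path)
{
	return rte_vhost_driver_disable_features(path,
			(1ULL << VIRTIO_NET_F_HOST_TSO4) |
			(1ULL << VIRTIO_NET_F_HOST_TSO6));
}
```

A backend that is not the builtin net driver (say, a SCSI one) would instead call `rte_vhost_driver_set_features()` with its own bit set, as the socket.c comment later in this patch notes.
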
+/**
+ * Enable vhost-user driver features.
+ *
+ * Note that
+ * - the features parameter should be a subset of the feature bits
+ *   provided by rte_vhost_driver_set_features().
+ * - it must be invoked before the vhost-user negotiation starts.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  Features to enable
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_enable_features(const char *path, uint64_t features);
+
+/**
+ * Disable vhost-user driver features.
+ *
+ * The two notes at rte_vhost_driver_enable_features() also apply here.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  Features to disable
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_disable_features(const char *path, uint64_t features);
+
+/**
+ * Get the feature bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param features
+ *  A pointer to store the queried feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_features(const char *path, uint64_t *features);
+
+/**
+ * Get the feature bits after negotiation.
+ *
+ * @param vid
+ *  Vhost device ID
+ * @param features
+ *  A pointer to store the queried feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(const char *path,
+	struct vhost_device_ops const * const ops);
+
+/**
+ * Start the vhost-user driver.
+ *
+ * This function triggers the vhost-user negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_start(const char *path);
+
+/**
+ * Get the MTU value of the device if set in QEMU.
+ *
+ * @param vid
+ *  virtio-net device ID
+ * @param mtu
+ *  The variable to store the MTU value
+ *
+ * @return
+ *  0: success
+ *  -EAGAIN: device not yet started
+ *  -ENOTSUP: device does not support MTU feature
+ */
+int rte_vhost_get_mtu(int vid, uint16_t *mtu);
+
+/**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ *  vhost device ID
+ *
+ * @return
+ *  The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
+ * @deprecated
+ * Get the number of queues the device supports.
+ *
+ * Note this function is deprecated, as it returns a queue pair number,
+ * which is vhost specific. Instead, rte_vhost_get_vring_num should
+ * be used.
+ *
+ * @param vid
+ *  vhost device ID
+ *
+ * @return
+ *  The number of queues, 0 on failure
+ */
+__rte_deprecated
+uint32_t rte_vhost_get_queue_num(int vid);
+
+/**
+ * Get the number of vrings the device supports.
+ *
+ * @param vid
+ *  vhost device ID
+ *
+ * @return
+ *  The number of vrings, 0 on failure
+ */
+uint16_t rte_vhost_get_vring_num(int vid);
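
Putting `RTE_VHOST_NEED_LOG()` and the logging helpers together, here is a hedged sketch of the dirty-logging pattern the doc comments above recommend for an external backend. The used-ring offset arithmetic is an assumption modeled on what a vring backend would need to log during live migration, not an API mandated by this patch:

```c
#include <stddef.h>
#include <linux/virtio_ring.h>
#include <rte_branch_prediction.h>
#include <rte_vhost.h>

/* Assume the features were cached once, e.g. by calling
 * rte_vhost_get_negotiated_features() from the new_device() callback. */
static uint64_t negotiated_features;

/* Log the dirtying of one used-ring entry; this only does work while
 * VHOST_F_LOG_ALL is negotiated, i.e. during live migration. */
static void log_used_elem(int vid, uint16_t vring_idx, uint16_t used_idx)
{
	if (unlikely(RTE_VHOST_NEED_LOG(negotiated_features)))
		rte_vhost_log_used_vring(vid, vring_idx,
			offsetof(struct vring_used, ring) +
			used_idx * sizeof(struct vring_used_elem),
			sizeof(struct vring_used_elem));
}
```
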
+/**
+ * Get the virtio net device's ifname, which is the vhost-user socket
+ * file path.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param buf
+ *  The buffer to store the queried ifname
+ * @param len
+ *  The length of buf
+ *
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ *
+ * @return
+ *  num of avail entries left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+struct rte_mbuf;
+struct rte_mempool;
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index in mq case
+ * @param pkts
+ *  array to contain packets to be enqueued
+ * @param count
+ *  number of packets to be enqueued
+ * @return
+ *  num of packets enqueued
+ */
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * This function gets guest buffers from the virtio device TX virtqueue,
+ * constructs host mbufs, copies the guest buffer content into them, and
+ * stores them in pkts to be processed.
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index in mq case
+ * @param mbuf_pool
+ *  mbuf_pool where host mbuf is allocated.
+ * @param pkts
+ *  array to contain packets to be dequeued
+ * @param count
+ *  number of packets to be dequeued
+ * @return
+ *  num of packets dequeued
+ */
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * Get guest mem table: a list of memory regions.
+ *
+ * An rte_vhost_memory object will be allocated internally, to hold the
+ * guest memory regions. The application should free it at the
+ * destroy_device() callback.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param mem
+ *  To store the returned mem regions
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
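
A short sketch combining `rte_vhost_get_mem_table()` with the new public `rte_vhost_gpa_to_vva()` inline defined earlier in this header. `guest_ptr()` is a hypothetical helper; for brevity it fetches and frees the table per call, whereas a real application would fetch it once in `new_device()` and free it in `destroy_device()`, as the comment above suggests:

```c
#include <stdint.h>
#include <stdlib.h>
#include <rte_vhost.h>

/* Hypothetical helper: translate one guest physical address to a
 * host virtual address, or NULL when it falls in no region. */
static void *guest_ptr(int vid, uint64_t gpa)
{
	struct rte_vhost_memory *mem;
	uint64_t vva;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		return NULL;

	vva = rte_vhost_gpa_to_vva(mem, gpa);
	free(mem);
	return (void *)(uintptr_t)vva;
}
```
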
+/**
+ * Get guest vring info, including the vring address, vring size, etc.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param vring_idx
+ *  vring index
+ * @param vring
+ *  the structure to hold the requested vring info
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+	struct rte_vhost_vring *vring);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_VHOST_H_ */
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 5ceaa8a5..07858732 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -4,12 +4,8 @@ DPDK_2.0 {
 	rte_vhost_dequeue_burst;
 	rte_vhost_driver_callback_register;
 	rte_vhost_driver_register;
-	rte_vhost_driver_session_start;
 	rte_vhost_enable_guest_notification;
 	rte_vhost_enqueue_burst;
-	rte_vhost_feature_disable;
-	rte_vhost_feature_enable;
-	rte_vhost_feature_get;
 
 	local: *;
 };
@@ -30,3 +26,22 @@ DPDK_16.07 {
 	rte_vhost_get_queue_num;
 
 } DPDK_2.1;
+
+DPDK_17.05 {
+	global:
+
+	rte_vhost_driver_disable_features;
+	rte_vhost_driver_enable_features;
+	rte_vhost_driver_get_features;
+	rte_vhost_driver_set_features;
+	rte_vhost_driver_start;
+	rte_vhost_get_mem_table;
+	rte_vhost_get_mtu;
+	rte_vhost_get_negotiated_features;
+	rte_vhost_get_vhost_vring;
+	rte_vhost_get_vring_num;
+	rte_vhost_gpa_to_vva;
+	rte_vhost_log_used_vring;
+	rte_vhost_log_write;
+
+} DPDK_16.07;
diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
deleted file mode 100644
index 926039c5..00000000
--- a/lib/librte_vhost/rte_virtio_net.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in
- *     the documentation and/or other materials provided with the
- *     distribution.
- *   * Neither the name of Intel Corporation nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */ - -#ifndef _VIRTIO_NET_H_ -#define _VIRTIO_NET_H_ - -/** - * @file - * Interface to vhost net - */ - -#include <stdint.h> -#include <linux/vhost.h> -#include <linux/virtio_ring.h> -#include <linux/virtio_net.h> -#include <sys/eventfd.h> -#include <sys/socket.h> -#include <linux/if.h> - -#include <rte_memory.h> -#include <rte_mempool.h> -#include <rte_ether.h> - -#define RTE_VHOST_USER_CLIENT (1ULL << 0) -#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) -#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) - -/* Enum for virtqueue management. */ -enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; - -/** - * Device and vring operations. - */ -struct virtio_net_device_ops { - int (*new_device)(int vid); /**< Add device. */ - void (*destroy_device)(int vid); /**< Remove device. */ - - int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ - - void *reserved[5]; /**< Reserved for future extension */ -}; - -/** - * Disable features in feature_mask. Returns 0 on success. - */ -int rte_vhost_feature_disable(uint64_t feature_mask); - -/** - * Enable features in feature_mask. Returns 0 on success. - */ -int rte_vhost_feature_enable(uint64_t feature_mask); - -/* Returns currently supported vhost features */ -uint64_t rte_vhost_feature_get(void); - -int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); - -/** - * Register vhost driver. path could be different for multiple - * instance support. - */ -int rte_vhost_driver_register(const char *path, uint64_t flags); - -/* Unregister vhost driver. This is only meaningful to vhost user. */ -int rte_vhost_driver_unregister(const char *path); - -/* Register callbacks. */ -int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); -/* Start vhost driver session blocking loop. */ -int rte_vhost_driver_session_start(void); - -/** - * Get the numa node from which the virtio net device's memory - * is allocated. - * - * @param vid - * virtio-net device ID - * - * @return - * The numa node, -1 on failure - */ -int rte_vhost_get_numa_node(int vid); - -/** - * Get the number of queues the device supports. - * - * @param vid - * virtio-net device ID - * - * @return - * The number of queues, 0 on failure - */ -uint32_t rte_vhost_get_queue_num(int vid); - -/** - * Get the virtio net device's ifname, which is the vhost-user socket - * file path. - * - * @param vid - * virtio-net device ID - * @param buf - * The buffer to stored the queried ifname - * @param len - * The length of buf - * - * @return - * 0 on success, -1 on failure - */ -int rte_vhost_get_ifname(int vid, char *buf, size_t len); - -/** - * Get how many avail entries are left in the queue - * - * @param vid - * virtio-net device ID - * @param queue_id - * virtio queue index - * - * @return - * num of avail entires left - */ -uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); - -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtual device. A packet - * count is returned to indicate the number of packets that were succesfully - * added to the RX queue. 
- * @param vid - * virtio-net device ID - * @param queue_id - * virtio queue index in mq case - * @param pkts - * array to contain packets to be enqueued - * @param count - * packets num to be enqueued - * @return - * num of packets enqueued - */ -uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); - -/** - * This function gets guest buffers from the virtio device TX virtqueue, - * construct host mbufs, copies guest buffer content to host mbufs and - * store them in pkts to be processed. - * @param vid - * virtio-net device - * @param queue_id - * virtio queue index in mq case - * @param mbuf_pool - * mbuf_pool where host mbuf is allocated. - * @param pkts - * array to contain packets to be dequeued - * @param count - * packets num to be dequeued - * @return - * num of packets dequeued - */ -uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); - -#endif /* _VIRTIO_NET_H_ */ diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index aaa9c270..c7f99b08 100644 --- a/lib/librte_vhost/socket.c +++ b/lib/librte_vhost/socket.c @@ -52,22 +52,42 @@ #include "vhost.h" #include "vhost_user.h" + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + /* * Every time rte_vhost_driver_register() is invoked, an associated * vhost_user_socket struct will be created. */ struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; char *path; - int listenfd; - int connfd; + int socket_fd; + struct sockaddr_un un; bool is_server; bool reconnect; bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. 
+ */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; }; struct vhost_user_connection { struct vhost_user_socket *vsocket; + int connfd; int vid; + + TAILQ_ENTRY(vhost_user_connection) next; }; #define MAX_VHOST_SOCKET 1024 @@ -82,7 +102,8 @@ struct vhost_user { static void vhost_user_server_new_connection(int fd, void *data, int *remove); static void vhost_user_read_cb(int fd, void *dat, int *remove); -static int vhost_user_create_client(struct vhost_user_socket *vsocket); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); static struct vhost_user vhost_user = { .fdset = { @@ -209,19 +230,24 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); - vsocket->connfd = fd; + conn->connfd = fd; conn->vsocket = vsocket; conn->vid = vid; ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, NULL, conn); if (ret < 0) { - vsocket->connfd = -1; + conn->connfd = -1; free(conn); close(fd); RTE_LOG(ERR, VHOST_CONFIG, "failed to add fd %d into vhost server fdset\n", fd); + return; } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); } /* call back when there is new vhost-user connection from client */ @@ -247,29 +273,36 @@ vhost_user_read_cb(int connfd, void *dat, int *remove) ret = vhost_user_msg_handler(conn->vid, connfd); if (ret < 0) { - vsocket->connfd = -1; close(connfd); *remove = 1; vhost_destroy_device(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + free(conn); - if (vsocket->reconnect) - vhost_user_create_client(vsocket); + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } } } static int -create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) +create_unix_socket(struct vhost_user_socket *vsocket) { int fd; + struct sockaddr_un *un = &vsocket->un; fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd < 0) return -1; RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", - is_server ? "server" : "client", fd); + vsocket->is_server ? 
"server" : "client", fd); - if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { RTE_LOG(ERR, VHOST_CONFIG, "vhost-user: can't set nonblocking mode for socket, fd: " "%d (%s)\n", fd, strerror(errno)); @@ -279,25 +312,21 @@ create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) memset(un, 0, sizeof(*un)); un->sun_family = AF_UNIX; - strncpy(un->sun_path, path, sizeof(un->sun_path)); + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); un->sun_path[sizeof(un->sun_path) - 1] = '\0'; - return fd; + vsocket->socket_fd = fd; + return 0; } static int -vhost_user_create_server(struct vhost_user_socket *vsocket) +vhost_user_start_server(struct vhost_user_socket *vsocket) { - int fd; int ret; - struct sockaddr_un un; + int fd = vsocket->socket_fd; const char *path = vsocket->path; - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); if (ret < 0) { RTE_LOG(ERR, VHOST_CONFIG, "failed to bind to %s: %s; remove it and try again\n", @@ -310,7 +339,6 @@ vhost_user_create_server(struct vhost_user_socket *vsocket) if (ret < 0) goto err; - vsocket->listenfd = fd; ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, NULL, vsocket); if (ret < 0) { @@ -429,26 +457,21 @@ vhost_user_reconnect_init(void) } static int -vhost_user_create_client(struct vhost_user_socket *vsocket) +vhost_user_start_client(struct vhost_user_socket *vsocket) { - int fd; int ret; - struct sockaddr_un un; + int fd = vsocket->socket_fd; const char *path = vsocket->path; struct vhost_user_reconnect *reconn; - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un, - sizeof(un)); + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); if (ret == 0) { vhost_user_add_connection(fd, vsocket); return 0; } - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(WARNING, VHOST_CONFIG, "failed to connect to %s: %s\n", path, strerror(errno)); @@ -457,7 +480,7 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) return -1; } - RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); reconn = malloc(sizeof(*reconn)); if (reconn == NULL) { RTE_LOG(ERR, VHOST_CONFIG, @@ -465,7 +488,7 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) close(fd); return -1; } - reconn->un = un; + reconn->un = vsocket->un; reconn->fd = fd; reconn->vsocket = vsocket; pthread_mutex_lock(&reconn_list.mutex); @@ -475,6 +498,94 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) return 0; } +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + /* * Register a new vhost-user socket; here we could act as server * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag @@ -502,9 +613,25 @@ rte_vhost_driver_register(const char *path, uint64_t flags) goto out; memset(vsocket, 0, sizeof(struct vhost_user_socket)); vsocket->path = strdup(path); - vsocket->connfd = -1; + TAILQ_INIT(&vsocket->conn_list); + pthread_mutex_init(&vsocket->conn_mutex, NULL); vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); if (vsocket->reconnect && reconn_tid == 0) { @@ -514,11 +641,10 @@ rte_vhost_driver_register(const char *path, uint64_t flags) goto out; } } - ret = vhost_user_create_client(vsocket); } else { vsocket->is_server = true; - ret = vhost_user_create_server(vsocket); } + ret = create_unix_socket(vsocket); if (ret < 0) { free(vsocket->path); free(vsocket); @@ -565,7 +691,7 @@ rte_vhost_driver_unregister(const char *path) { int i; int count; - struct vhost_user_connection *conn; + struct vhost_user_connection *conn, *next; pthread_mutex_lock(&vhost_user.mutex); @@ -574,22 +700,29 @@ rte_vhost_driver_unregister(const char *path) if (!strcmp(vsocket->path, path)) { if (vsocket->is_server) { - fdset_del(&vhost_user.fdset, vsocket->listenfd); - close(vsocket->listenfd); + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); unlink(path); } else if (vsocket->reconnect) { vhost_user_remove_reconnect(vsocket); } - conn = fdset_del(&vhost_user.fdset, vsocket->connfd); - if (conn) { + pthread_mutex_lock(&vsocket->conn_mutex); + for (conn = TAILQ_FIRST(&vsocket->conn_list); + conn != NULL; + conn = next) { + next = TAILQ_NEXT(conn, next); + + fdset_del(&vhost_user.fdset, conn->connfd); RTE_LOG(INFO, VHOST_CONFIG, "free connfd = %d for device '%s'\n", - vsocket->connfd, path); - close(vsocket->connfd); + conn->connfd, path); + close(conn->connfd); vhost_destroy_device(conn->vid); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); free(conn); } + pthread_mutex_unlock(&vsocket->conn_mutex); free(vsocket->path); free(vsocket); @@ -607,9 +740,59 @@ rte_vhost_driver_unregister(const char *path) return -1; } +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + int -rte_vhost_driver_session_start(void) +rte_vhost_driver_start(const char *path) { - fdset_event_dispatch(&vhost_user.fdset); - return 0; + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); } diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index e4150934..0b19d2eb 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -45,36 +45,12 @@ #include <rte_string_fns.h> #include <rte_memory.h> #include <rte_malloc.h> -#include <rte_virtio_net.h> +#include <rte_vhost.h> #include "vhost.h" -#define VHOST_USER_F_PROTOCOL_FEATURES 30 - -/* Features supported by this lib. */ -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ - (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ - (1ULL << VIRTIO_NET_F_CTRL_RX) | \ - (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ - (VHOST_SUPPORTS_MQ) | \ - (1ULL << VIRTIO_F_VERSION_1) | \ - (1ULL << VHOST_F_LOG_ALL) | \ - (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ - (1ULL << VIRTIO_NET_F_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ - (1ULL << VIRTIO_RING_F_INDIRECT_DESC)) - -uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; - struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; -/* device ops to add/remove device to/from data core. */ -struct virtio_net_device_ops const *notify_ops; - struct virtio_net * get_device(int vid) { @@ -108,10 +84,8 @@ cleanup_device(struct virtio_net *dev, int destroy) vhost_backend_cleanup(dev); - for (i = 0; i < dev->virt_qp_nb; i++) { - cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy); - cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy); - } + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); } /* @@ -121,24 +95,21 @@ static void free_device(struct virtio_net *dev) { uint32_t i; - struct vhost_virtqueue *rxq, *txq; + struct vhost_virtqueue *vq; - for (i = 0; i < dev->virt_qp_nb; i++) { - rxq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; - txq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; - rte_free(rxq->shadow_used_ring); - rte_free(txq->shadow_used_ring); + rte_free(vq->shadow_used_ring); - /* rxq and txq are allocated together as queue-pair */ - rte_free(rxq); + rte_free(vq); } rte_free(dev); } static void -init_vring_queue(struct vhost_virtqueue *vq, int qp_idx) +init_vring_queue(struct vhost_virtqueue *vq) { memset(vq, 0, sizeof(struct vhost_virtqueue)); @@ -148,69 +119,48 @@ init_vring_queue(struct vhost_virtqueue *vq, int qp_idx) /* Backends are set to -1 indicating an inactive device. */ vq->backend = -1; - /* always set the default vq pair to enabled */ - if (qp_idx == 0) - vq->enabled = 1; + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. 
+ */ + vq->enabled = 1; TAILQ_INIT(&vq->zmbuf_list); } static void -init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - -static void -reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx) +reset_vring_queue(struct vhost_virtqueue *vq) { int callfd; callfd = vq->callfd; - init_vring_queue(vq, qp_idx); + init_vring_queue(vq); vq->callfd = callfd; } -static void -reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - int -alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) { - struct vhost_virtqueue *virtqueue = NULL; - uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ; - uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ; + struct vhost_virtqueue *vq; - virtqueue = rte_malloc(NULL, - sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0); - if (virtqueue == NULL) { + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for virt qp:%d.\n", qp_idx); + "Failed to allocate memory for vring:%u.\n", vring_idx); return -1; } - dev->virtqueue[virt_rx_q_idx] = virtqueue; - dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ; - - init_vring_queue_pair(dev, qp_idx); + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); - dev->virt_qp_nb += 1; + dev->nr_vring += 1; return 0; } /* * Reset some variables in device structure, while keeping few - * others untouched, such as vid, ifname, virt_qp_nb: they + * others untouched, such as vid, ifname, nr_vring: they * should be same unless the device is removed. 
*/ void @@ -222,8 +172,8 @@ reset_device(struct virtio_net *dev) dev->protocol_features = 0; dev->flags = 0; - for (i = 0; i < dev->virt_qp_nb; i++) - reset_vring_queue_pair(dev, i); + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); } /* @@ -274,7 +224,7 @@ vhost_destroy_device(int vid) if (dev->flags & VIRTIO_DEV_RUNNING) { dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); + dev->notify_ops->destroy_device(vid); } cleanup_device(dev, 1); @@ -312,6 +262,25 @@ vhost_enable_dequeue_zero_copy(int vid) } int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int rte_vhost_get_numa_node(int vid) { #ifdef RTE_LIBRTE_VHOST_NUMA @@ -345,7 +314,18 @@ rte_vhost_get_queue_num(int vid) if (dev == NULL) return 0; - return dev->virt_qp_nb; + return dev->nr_vring / 2; +} + +uint16_t +rte_vhost_get_vring_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring; } int @@ -364,6 +344,72 @@ rte_vhost_get_ifname(int vid, char *buf, size_t len) return 0; } +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + return 0; +} + uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id) { @@ -399,33 +445,33 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) return 0; } -uint64_t rte_vhost_feature_get(void) +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) { - return VHOST_FEATURES; -} + struct virtio_net *dev = get_device(vid); -int rte_vhost_feature_disable(uint64_t feature_mask) -{ - VHOST_FEATURES = VHOST_FEATURES & ~feature_mask; - return 0; -} + if (dev == NULL) + return; -int rte_vhost_feature_enable(uint64_t feature_mask) -{ - if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) { - VHOST_FEATURES = VHOST_FEATURES | feature_mask; - return 0; - } - return -1; + vhost_log_write(dev, addr, len); } -/* - * Register ops so that we can add/remove device to data core. 
- */ -int -rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops) +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) { - notify_ops = ops; + struct virtio_net *dev; + struct vhost_virtqueue *vq; - return 0; + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); } diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 22564f1c..ddd8a9c4 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,13 +39,19 @@ #include <sys/queue.h> #include <unistd.h> #include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <sys/socket.h> +#include <linux/if.h> #include <rte_log.h> +#include <rte_ether.h> -#include "rte_virtio_net.h" +#include "rte_vhost.h" /* Used to indicate that the device is running on a data core */ #define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 /* Backend value set by guest. */ #define VIRTIO_DEV_STOPPED -1 @@ -110,24 +116,20 @@ struct vhost_virtqueue { uint16_t shadow_used_idx; } __rte_cache_aligned; -/* Old kernels have no such macro defined */ +/* Old kernels have no such macros defined */ #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 #endif +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif -/* - * Make an extra wrapper for VIRTIO_NET_F_MQ and - * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are - * introduced since kernel v3.8. This makes our - * code buildable for older kernel. - */ -#ifdef VIRTIO_NET_F_MQ - #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX - #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ) -#else - #define VHOST_MAX_QUEUE_PAIRS 1 - #define VHOST_SUPPORTS_MQ 0 +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 #endif /* @@ -137,6 +139,27 @@ struct vhost_virtqueue { #define VIRTIO_F_VERSION_1 32 #endif +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + struct guest_page { uint64_t guest_phys_addr; uint64_t host_phys_addr; @@ -149,7 +172,7 @@ struct guest_page { */ struct virtio_net { /* Frontend (QEMU) memory and memory region information */ - struct virtio_memory *mem; + struct rte_vhost_memory *mem; uint64_t features; uint64_t protocol_features; int vid; @@ -157,7 +180,7 @@ struct virtio_net { uint16_t vhost_hlen; /* to tell if we need broadcast rarp packet */ rte_atomic16_t broadcast_rarp; - uint32_t virt_qp_nb; + uint32_t nr_vring; int dequeue_zero_copy; struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) @@ -166,35 +189,52 @@ struct virtio_net { uint64_t log_base; uint64_t log_addr; struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; uint32_t nr_guest_pages; uint32_t max_guest_pages; struct guest_page *guest_pages; } __rte_cache_aligned; -/** - * Information relating to memory regions including offsets to - * addresses in QEMUs memory file. - */ -struct virtio_memory_region { - uint64_t guest_phys_addr; - uint64_t guest_user_addr; - uint64_t host_user_addr; - uint64_t size; - void *mmap_addr; - uint64_t mmap_size; - int fd; -}; +#define VHOST_LOG_PAGE 4096 -/** - * Memory structure includes region and mapping information. - */ -struct virtio_memory { - uint32_t nregions; - struct virtio_memory_region regions[0]; -}; +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} /* Macros for printing using RTE_LOG */ #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 @@ -231,25 +271,6 @@ extern uint64_t VHOST_FEATURES; #define MAX_VHOST_DEVICE 1024 extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; -/* Convert guest physical Address to host virtual address */ -static inline uint64_t __attribute__((always_inline)) -gpa_to_vva(struct virtio_net *dev, uint64_t gpa) -{ - struct virtio_memory_region *reg; - uint32_t i; - - for (i = 0; i < dev->mem->nregions; i++) { - reg = &dev->mem->regions[i]; - if (gpa >= reg->guest_phys_addr && - gpa < reg->guest_phys_addr + reg->size) { - return gpa - reg->guest_phys_addr + - reg->host_user_addr; - } - } - - return 0; -} - /* Convert guest physical address to host physical address */ static inline phys_addr_t __attribute__((always_inline)) gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) @@ -270,7 +291,6 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) return 0; } -struct virtio_net_device_ops const *notify_ops; struct virtio_net *get_device(int vid); int vhost_new_device(void); @@ -278,11 +298,13 @@ void cleanup_device(struct virtio_net *dev, int destroy); void reset_device(struct virtio_net *dev); void vhost_destroy_device(int); -int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx); +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); void vhost_set_ifname(int, const char *if_name, unsigned int if_len); void vhost_enable_dequeue_zero_copy(int vid); +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + /* * Backend-specific cleanup. 
* diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index 0cb1c677..5c8058b6 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -51,6 +51,9 @@ #include "vhost.h" #include "vhost_user.h" +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_NONE] = "VHOST_USER_NONE", [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", @@ -72,6 +75,7 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", }; static uint64_t @@ -88,7 +92,7 @@ static void free_mem_region(struct virtio_net *dev) { uint32_t i; - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; if (!dev || !dev->mem) return; @@ -131,7 +135,7 @@ vhost_user_reset_owner(struct virtio_net *dev) { if (dev->flags & VIRTIO_DEV_RUNNING) { dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(dev->vid); + dev->notify_ops->destroy_device(dev->vid); } cleanup_device(dev, 0); @@ -143,9 +147,12 @@ vhost_user_reset_owner(struct virtio_net *dev) * The features that we support are requested. */ static uint64_t -vhost_user_get_features(void) +vhost_user_get_features(struct virtio_net *dev) { - return VHOST_FEATURES; + uint64_t features = 0; + + rte_vhost_driver_get_features(dev->ifname, &features); + return features; } /* @@ -154,9 +161,17 @@ vhost_user_get_features(void) static int vhost_user_set_features(struct virtio_net *dev, uint64_t features) { - if (features & ~VHOST_FEATURES) + uint64_t vhost_features = 0; + + rte_vhost_driver_get_features(dev->ifname, &vhost_features); + if (features & ~vhost_features) return -1; + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->features != features) { + if (dev->notify_ops->features_changed) + dev->notify_ops->features_changed(dev->vid, features); + } + dev->features = features; if (dev->features & ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { @@ -223,12 +238,7 @@ numa_realloc(struct virtio_net *dev, int index) struct vhost_virtqueue *old_vq, *vq; int ret; - /* - * vq is allocated on pairs, we should try to do realloc - * on first queue of one queue pair only. - */ - if (index % VIRTIO_QNUM != 0) - return dev; + enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; old_dev = dev; vq = old_vq = dev->virtqueue[index]; @@ -247,8 +257,7 @@ numa_realloc(struct virtio_net *dev, int index) if (oldnode != newnode) { RTE_LOG(INFO, VHOST_CONFIG, "reallocate vq from %d to %d node\n", oldnode, newnode); - vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, - newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); if (!vq) return dev; @@ -280,7 +289,6 @@ numa_realloc(struct virtio_net *dev, int index) out: dev->virtqueue[index] = vq; - dev->virtqueue[index + 1] = vq + 1; vhost_devices[dev->vid] = dev; return dev; @@ -300,7 +308,7 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused) static uint64_t qva_to_vva(struct virtio_net *dev, uint64_t qva) { - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; uint32_t i; /* Find the region where the address lives. 
*/ @@ -428,7 +436,7 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, } static void -add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, uint64_t page_size) { uint64_t reg_size = reg->size; @@ -488,7 +496,7 @@ static int vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) { struct VhostUserMemory memory = pmsg->payload.memory; - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; void *mmap_addr; uint64_t mmap_size; uint64_t mmap_offset; @@ -496,12 +504,6 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) uint32_t i; int fd; - /* Remove from the data plane. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(dev->vid); - } - if (dev->mem) { free_mem_region(dev); rte_free(dev->mem); @@ -515,8 +517,8 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) sizeof(struct guest_page)); } - dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) + - sizeof(struct virtio_memory_region) * memory.nregions, 0); + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); if (dev->mem == NULL) { RTE_LOG(ERR, VHOST_CONFIG, "(%d) failed to allocate memory for dev->mem\n", @@ -611,18 +613,17 @@ vq_is_ready(struct vhost_virtqueue *vq) static int virtio_is_ready(struct virtio_net *dev) { - struct vhost_virtqueue *rvq, *tvq; + struct vhost_virtqueue *vq; uint32_t i; - for (i = 0; i < dev->virt_qp_nb; i++) { - rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; - tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; + if (dev->nr_vring == 0) + return 0; - if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) { - RTE_LOG(INFO, VHOST_CONFIG, - "virtio is not ready for processing.\n"); + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (!vq_is_ready(vq)) return 0; - } } RTE_LOG(INFO, VHOST_CONFIG, @@ -635,7 +636,6 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) { struct vhost_vring_file file; struct vhost_virtqueue *vq; - uint32_t cur_qp_idx; file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) @@ -645,29 +645,13 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) RTE_LOG(INFO, VHOST_CONFIG, "vring call idx:%d file:%d\n", file.index, file.fd); - /* - * FIXME: VHOST_SET_VRING_CALL is the first per-vring message - * we get, so we do vring queue pair allocation here. - */ - cur_qp_idx = file.index / VIRTIO_QNUM; - if (cur_qp_idx + 1 > dev->virt_qp_nb) { - if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) - return; - } - vq = dev->virtqueue[file.index]; - assert(vq != NULL); - if (vq->callfd >= 0) close(vq->callfd); vq->callfd = file.fd; } -/* - * In vhost-user, when we receive kick message, will test whether virtio - * device is ready for packet processing. 
-	 */
 static void
 vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 {
@@ -686,16 +670,6 @@
 	if (vq->kickfd >= 0)
 		close(vq->kickfd);
 	vq->kickfd = file.fd;
-
-	if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
-		if (dev->dequeue_zero_copy) {
-			RTE_LOG(INFO, VHOST_CONFIG,
-				"dequeue zero copy is enabled\n");
-		}
-
-		if (notify_ops->new_device(dev->vid) == 0)
-			dev->flags |= VIRTIO_DEV_RUNNING;
-	}
 }
 
 static void
@@ -726,9 +700,11 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 	/* We have to stop the queue (virtio) if it is running. */
 	if (dev->flags & VIRTIO_DEV_RUNNING) {
 		dev->flags &= ~VIRTIO_DEV_RUNNING;
-		notify_ops->destroy_device(dev->vid);
+		dev->notify_ops->destroy_device(dev->vid);
 	}
 
+	dev->flags &= ~VIRTIO_DEV_READY;
+
 	/* Here we are safe to get the last used index */
 	state->num = vq->last_used_idx;
 
@@ -766,8 +742,8 @@ vhost_user_set_vring_enable(struct virtio_net *dev,
 		"set queue enable: %d to qp idx: %d\n",
 		enable, state->index);
 
-	if (notify_ops->vring_state_changed)
-		notify_ops->vring_state_changed(dev->vid, state->index, enable);
+	if (dev->notify_ops->vring_state_changed)
+		dev->notify_ops->vring_state_changed(dev->vid, state->index, enable);
 
 	dev->virtqueue[state->index]->enabled = enable;
 
@@ -865,6 +841,22 @@ vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
 	return 0;
 }
 
+static int
+vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	if (msg->payload.u64 < VIRTIO_MIN_MTU ||
+			msg->payload.u64 > VIRTIO_MAX_MTU) {
+		RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
+				msg->payload.u64);
+
+		return -1;
+	}
+
+	dev->mtu = msg->payload.u64;
+
+	return 0;
+}
+
 /* return bytes# of read on success or negative val on failure. */
 static int
 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
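vhost_user_net_set_mtu() above only stores the value after range-checking it against VIRTIO_MIN_MTU/VIRTIO_MAX_MTU; the application reads it back later, typically from its new_device() callback. A sketch assuming the rte_vhost_get_mtu() accessor exported by this release (treat the accessor name as an assumption):

    #include <stdio.h>
    #include <stdint.h>
    #include <rte_vhost.h>

    static int
    on_new_device(int vid)   /* illustrative new_device() callback */
    {
    	uint16_t mtu = 0;

    	/* Returns 0 once VHOST_USER_NET_SET_MTU has been processed. */
    	if (rte_vhost_get_mtu(vid, &mtu) == 0)
    		printf("vhost device %d negotiated MTU %u\n", vid, mtu);

    	return 0;
    }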
@@ -904,6 +896,7 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg)
 		return 0;
 
 	msg->flags &= ~VHOST_USER_VERSION_MASK;
+	msg->flags &= ~VHOST_USER_NEED_REPLY;
 	msg->flags |= VHOST_USER_VERSION;
 	msg->flags |= VHOST_USER_REPLY_MASK;
 
@@ -913,6 +906,44 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg)
 	return ret;
 }
 
+/*
+ * Allocate a queue pair if it hasn't been allocated yet
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+{
+	uint16_t vring_idx;
+
+	switch (msg->request) {
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_VRING_ERR:
+		vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+		break;
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_BASE:
+	case VHOST_USER_SET_VRING_ENABLE:
+		vring_idx = msg->payload.state.index;
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		vring_idx = msg->payload.addr.index;
+		break;
+	default:
+		return 0;
+	}
+
+	if (vring_idx >= VHOST_MAX_VRING) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"invalid vring index: %u\n", vring_idx);
+		return -1;
+	}
+
+	if (dev->virtqueue[vring_idx])
+		return 0;
+
+	return alloc_vring_queue(dev, vring_idx);
+}
+
 int
 vhost_user_msg_handler(int vid, int fd)
 {
@@ -924,6 +955,16 @@ vhost_user_msg_handler(int vid, int fd)
 	if (dev == NULL)
 		return -1;
 
+	if (!dev->notify_ops) {
+		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
+		if (!dev->notify_ops) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"failed to get callback ops for driver %s\n",
+				dev->ifname);
+			return -1;
+		}
+	}
+
 	ret = read_vhost_message(fd, &msg);
 	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
 		if (ret < 0)
@@ -939,11 +980,20 @@ vhost_user_msg_handler(int vid, int fd)
 		return -1;
 	}
 
+	ret = 0;
 	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
 		vhost_message_str[msg.request]);
+
+	ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to alloc queue\n");
+		return -1;
+	}
+
 	switch (msg.request) {
 	case VHOST_USER_GET_FEATURES:
-		msg.payload.u64 = vhost_user_get_features();
+		msg.payload.u64 = vhost_user_get_features(dev);
 		msg.size = sizeof(msg.payload.u64);
 		send_vhost_message(fd, &msg);
 		break;
@@ -968,7 +1018,7 @@ vhost_user_msg_handler(int vid, int fd)
 		break;
 
 	case VHOST_USER_SET_MEM_TABLE:
-		vhost_user_set_mem_table(dev, &msg);
+		ret = vhost_user_set_mem_table(dev, &msg);
 		break;
 
 	case VHOST_USER_SET_LOG_BASE:
@@ -994,7 +1044,7 @@ vhost_user_msg_handler(int vid, int fd)
 		break;
 
 	case VHOST_USER_GET_VRING_BASE:
-		ret = vhost_user_get_vring_base(dev, &msg.payload.state);
+		vhost_user_get_vring_base(dev, &msg.payload.state);
 		msg.size = sizeof(msg.payload.state);
 		send_vhost_message(fd, &msg);
 		break;
@@ -1025,10 +1075,35 @@ vhost_user_msg_handler(int vid, int fd)
 		vhost_user_send_rarp(dev, &msg);
 		break;
 
+	case VHOST_USER_NET_SET_MTU:
+		ret = vhost_user_net_set_mtu(dev, &msg);
+		break;
+
 	default:
+		ret = -1;
 		break;
+	}
+
+	if (msg.flags & VHOST_USER_NEED_REPLY) {
+		msg.payload.u64 = !!ret;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
 	}
 
+	if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+		dev->flags |= VIRTIO_DEV_READY;
+
+		if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+			if (dev->dequeue_zero_copy) {
+				RTE_LOG(INFO, VHOST_CONFIG,
+					"dequeue zero copy is enabled\n");
+			}
+
+			if (dev->notify_ops->new_device(dev->vid) == 0)
+				dev->flags |= VIRTIO_DEV_RUNNING;
+		}
+	}
+
 	return 0;
 }
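The tail of vhost_user_msg_handler() implements the REPLY_ACK protocol: when the master sets the need-reply bit on a request, the slave answers with a u64 status where 0 means success, and send_vhost_message() strips the bit so a reply can never itself demand an ack. A small sketch of both halves of that flag handling, under the flag values defined in vhost_user.h below:

    #include <stdint.h>

    #define VHOST_USER_VERSION       0x1       /* assumed current protocol version */
    #define VHOST_USER_VERSION_MASK  0x3
    #define VHOST_USER_REPLY_MASK    (0x1 << 2)
    #define VHOST_USER_NEED_REPLY    (0x1 << 3)

    /* Slave side: turn a handler result into the ack payload. */
    static uint64_t
    ack_payload(int handler_ret)
    {
    	return handler_ret == 0 ? 0 : 1;   /* non-zero reports failure */
    }

    /* Slave side: stamp reply flags the way send_vhost_message() does. */
    static uint32_t
    reply_flags(uint32_t request_flags)
    {
    	uint32_t flags = request_flags;

    	flags &= ~VHOST_USER_VERSION_MASK;
    	flags &= ~VHOST_USER_NEED_REPLY;   /* a reply never requests an ack */
    	flags |= VHOST_USER_VERSION;
    	flags |= VHOST_USER_REPLY_MASK;
    	return flags;
    }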
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index ba78d326..35ebd719 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -37,7 +37,7 @@
 #include <stdint.h>
 #include <linux/vhost.h>
 
-#include "rte_virtio_net.h"
+#include "rte_vhost.h"
 
 /* refer to hw/virtio/vhost-user.c */
 
@@ -46,10 +46,18 @@
 #define VHOST_USER_PROTOCOL_F_MQ	0
 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
 #define VHOST_USER_PROTOCOL_F_RARP	2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
 
+/*
+ * disable REPLY_ACK feature to workaround the buggy QEMU implementation.
+ * Proved buggy QEMU includes v2.7 - v2.9.
+ */
 #define VHOST_USER_PROTOCOL_FEATURES	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-					 (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+					 (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+					 (0ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))
 
 typedef enum VhostUserRequest {
 	VHOST_USER_NONE = 0,
@@ -72,6 +80,7 @@ typedef enum VhostUserRequest {
 	VHOST_USER_GET_QUEUE_NUM = 17,
 	VHOST_USER_SET_VRING_ENABLE = 18,
 	VHOST_USER_SEND_RARP = 19,
+	VHOST_USER_NET_SET_MTU = 20,
 	VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -98,6 +107,7 @@ typedef struct VhostUserMsg {
 
 #define VHOST_USER_VERSION_MASK     0x3
 #define VHOST_USER_REPLY_MASK       (0x1 << 2)
+#define VHOST_USER_NEED_REPLY       (0x1 << 3)
 	uint32_t flags;
 	uint32_t size; /* the following payload size */
 	union {
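Note the trick in the feature mask above: shifting 0ULL instead of 1ULL keeps VHOST_USER_PROTOCOL_F_REPLY_ACK visible in the expression while contributing no bit, which is how the feature stays compiled-in but advertised as disabled for the QEMU versions named in the comment. A tiny illustration of testing such a bit:

    #include <stdint.h>
    #include <stdbool.h>

    static bool
    protocol_feature_enabled(uint64_t features, unsigned int bit)
    {
    	return (features & (1ULL << bit)) != 0;
    }

    /*
     * With the mask as defined above:
     *   protocol_feature_enabled(VHOST_USER_PROTOCOL_FEATURES,
     *                            VHOST_USER_PROTOCOL_F_REPLY_ACK) -> false
     *   protocol_feature_enabled(VHOST_USER_PROTOCOL_FEATURES,
     *                            VHOST_USER_PROTOCOL_F_NET_MTU)   -> true
     */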
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 337470d6..48219e05 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -39,7 +39,7 @@
 #include <rte_memcpy.h>
 #include <rte_ether.h>
 #include <rte_ip.h>
-#include <rte_virtio_net.h>
+#include <rte_vhost.h>
 #include <rte_tcp.h>
 #include <rte_udp.h>
 #include <rte_sctp.h>
@@ -48,47 +48,11 @@
 #include "vhost.h"
 
 #define MAX_PKT_BURST 32
-#define VHOST_LOG_PAGE	4096
-
-static inline void __attribute__((always_inline))
-vhost_log_page(uint8_t *log_base, uint64_t page)
-{
-	log_base[page / 8] |= 1 << (page % 8);
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
-{
-	uint64_t page;
-
-	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
-		   !dev->log_base || !len))
-		return;
-
-	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
-		return;
-
-	/* To make sure guest memory updates are committed before logging */
-	rte_smp_wmb();
-
-	page = addr / VHOST_LOG_PAGE;
-	while (page * VHOST_LOG_PAGE < addr + len) {
-		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
-		page += 1;
-	}
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		     uint64_t offset, uint64_t len)
-{
-	vhost_log_write(dev, vq->log_guest_addr + offset, len);
-}
 
 static bool
-is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
 {
-	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+	return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
 }
 
 static inline void __attribute__((always_inline))
@@ -141,6 +105,12 @@ update_shadow_used_ring(struct vhost_virtqueue *vq,
 	vq->shadow_used_ring[i].len = len;
 }
 
+/* avoid write operation when necessary, to lessen cache issues */
+#define ASSIGN_UNLESS_EQUAL(var, val) do {	\
+	if ((var) != (val))			\
+		(var) = (val);			\
+} while (0)
+
 static void
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
@@ -162,6 +132,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 						cksum));
 			break;
 		}
+	} else {
+		ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
+		ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
+		ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
 	}
 
 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -172,19 +146,13 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 		net_hdr->gso_size = m_buf->tso_segsz;
 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 					+ m_buf->l4_len;
+	} else {
+		ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
+		ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
+		ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
 static inline int __attribute__((always_inline))
 copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
@@ -194,12 +162,11 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	uint32_t cpy_len;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	/* A counter to avoid desc dead loop chain */
 	uint16_t nr_desc = 1;
 
 	desc = &descs[desc_idx];
-	desc_addr = gpa_to_vva(dev, desc->addr);
+	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
 	/*
 	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
 	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
@@ -210,8 +177,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+	virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 
@@ -239,7 +205,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 			return -1;
 
 		desc = &descs[desc->next];
-		desc_addr = gpa_to_vva(dev, desc->addr);
+		desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
 		if (unlikely(!desc_addr))
 			return -1;
 
@@ -283,7 +249,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	uint32_t i, sz;
 
 	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
 		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
 			dev->vid, __func__, queue_id);
 		return 0;
@@ -323,7 +289,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		int err;
 
 		if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
-			descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
+			descs = (struct vring_desc *)(uintptr_t)
+				rte_vhost_gpa_to_vva(dev->mem,
 					vq->desc[desc_idx].addr);
 			if (unlikely(!descs)) {
 				count = i;
@@ -383,7 +350,7 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
 		descs = (struct vring_desc *)(uintptr_t)
-			gpa_to_vva(dev, vq->desc[idx].addr);
+			rte_vhost_gpa_to_vva(dev->mem, vq->desc[idx].addr);
 		if (unlikely(!descs))
 			return -1;
 
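The ASSIGN_UNLESS_EQUAL macro introduced earlier in this file's diff, and used throughout virtio_enqueue_offload() above, exists because the virtio net header lives in guest memory that the guest may be polling: skipping a store that would not change the value avoids dirtying a shared cacheline. Its behavior spelled out on a concrete header (a minimal sketch using the kernel's virtio header definition):

    #include <linux/virtio_net.h>

    #define ASSIGN_UNLESS_EQUAL(var, val) do {	\
    	if ((var) != (val))			\
    		(var) = (val);			\
    } while (0)

    /* Illustrative helper: writes happen only when a previous
     * packet left stale offload values behind. */
    static void
    clear_hdr_if_needed(struct virtio_net_hdr *hdr)
    {
    	ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
    	ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
    	ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
    }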
@@ -461,7 +428,6 @@ static inline int __attribute__((always_inline))
 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 			    struct buf_vector *buf_vec, uint16_t num_buffers)
 {
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint32_t vec_idx = 0;
 	uint64_t desc_addr;
 	uint32_t mbuf_offset, mbuf_avail;
@@ -473,7 +439,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	if (unlikely(m == NULL))
 		return -1;
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+	desc_addr = rte_vhost_gpa_to_vva(dev->mem, buf_vec[vec_idx].buf_addr);
 	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
 		return -1;
 
@@ -482,7 +448,6 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
-	virtio_hdr.num_buffers = num_buffers;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, num_buffers);
 
@@ -495,7 +460,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		/* done with current desc buf, get the next one */
 		if (desc_avail == 0) {
 			vec_idx++;
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+			desc_addr = rte_vhost_gpa_to_vva(dev->mem,
+					buf_vec[vec_idx].buf_addr);
 			if (unlikely(!desc_addr))
 				return -1;
 
@@ -514,8 +480,13 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 		}
 
 		if (hdr_addr) {
-			virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
-			copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
+			struct virtio_net_hdr_mrg_rxbuf *hdr;
+
+			hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
+				hdr_addr;
+			virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
+			ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
+
 			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
 			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 				     dev->vhost_hlen, 0);
@@ -552,7 +523,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	uint16_t avail_head;
 
 	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
 		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
 			dev->vid, __func__, queue_id);
 		return 0;
@@ -663,14 +634,14 @@ parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
 
 	switch (ethertype) {
 	case ETHER_TYPE_IPv4:
-		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
+		ipv4_hdr = l3_hdr;
 		*l4_proto = ipv4_hdr->next_proto_id;
 		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
 		*l4_hdr = (char *)l3_hdr + m->l3_len;
 		m->ol_flags |= PKT_TX_IPV4;
 		break;
 	case ETHER_TYPE_IPv6:
-		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
+		ipv6_hdr = l3_hdr;
 		*l4_proto = ipv6_hdr->proto;
 		m->l3_len = sizeof(struct ipv6_hdr);
 		*l4_hdr = (char *)l3_hdr + m->l3_len;
@@ -720,7 +691,7 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 		case VIRTIO_NET_HDR_GSO_TCPV4:
 		case VIRTIO_NET_HDR_GSO_TCPV6:
-			tcp_hdr = (struct tcp_hdr *)l4_hdr;
+			tcp_hdr = l4_hdr;
 			m->ol_flags |= PKT_TX_TCP_SEG;
 			m->tso_segsz = hdr->gso_size;
 			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
@@ -798,7 +769,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 			(desc->flags & VRING_DESC_F_INDIRECT))
 		return -1;
 
-	desc_addr = gpa_to_vva(dev, desc->addr);
+	desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
 	if (unlikely(!desc_addr))
 		return -1;
 
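vhost_dequeue_offload() above runs the enqueue path in reverse: it converts the guest-filled virtio net header back into mbuf offload metadata for transmit. A reduced sketch of its GSO branch, keeping only the TCP cases and substituting simplified stand-in types for the rte_mbuf fields it touches:

    #include <stdint.h>
    #include <linux/virtio_net.h>

    struct mbuf_meta {              /* stand-in for the rte_mbuf fields used */
    	uint64_t ol_flags;
    	uint16_t tso_segsz;
    };

    #define PKT_TX_TCP_SEG (1ULL << 50)   /* illustrative flag value */

    static void
    apply_gso(const struct virtio_net_hdr *hdr, struct mbuf_meta *m)
    {
    	switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
    	case VIRTIO_NET_HDR_GSO_TCPV4:
    	case VIRTIO_NET_HDR_GSO_TCPV6:
    		/* Guest requested segmentation; carry it into the mbuf. */
    		m->ol_flags |= PKT_TX_TCP_SEG;
    		m->tso_segsz = hdr->gso_size;
    		break;
    	default:
    		break;
    	}
    }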
@@ -818,7 +789,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 			return -1;
 
-		desc_addr = gpa_to_vva(dev, desc->addr);
+		desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
 		if (unlikely(!desc_addr))
 			return -1;
 
@@ -882,7 +853,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 				return -1;
 
-			desc_addr = gpa_to_vva(dev, desc->addr);
+			desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
 			if (unlikely(!desc_addr))
 				return -1;
 
@@ -905,6 +876,8 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
 					"allocate memory for mbuf.\n");
 				return -1;
 			}
+			if (unlikely(dev->dequeue_zero_copy))
+				rte_mbuf_refcnt_update(cur, 1);
 
 			prev->next = cur;
 			prev->data_len = mbuf_offset;
@@ -1017,7 +990,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	if (!dev)
 		return 0;
 
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
 		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
 			dev->vid, __func__, queue_id);
 		return 0;
@@ -1056,9 +1029,21 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	 * array, to looks like that guest actually send such packet.
 	 *
 	 * Check user_send_rarp() for more information.
+	 *
+	 * broadcast_rarp shares a cacheline in the virtio_net structure
+	 * with some fields that are accessed during enqueue and
+	 * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
+	 * result in false sharing between enqueue and dequeue.
+	 *
+	 * Prevent unnecessary false sharing by reading broadcast_rarp first
+	 * and only performing cmpset if the read indicates it is likely to
+	 * be set.
 	 */
-	if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
-			 &dev->broadcast_rarp.cnt, 1, 0))) {
+
+	if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
+			rte_atomic16_cmpset((volatile uint16_t *)
+			&dev->broadcast_rarp.cnt, 1, 0))) {
+
 		rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
 		if (rarp_mbuf == NULL) {
 			RTE_LOG(ERR, VHOST_DATA,
@@ -1113,7 +1098,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
 		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
-			desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
+			desc = (struct vring_desc *)(uintptr_t)
+				rte_vhost_gpa_to_vva(dev->mem,
 					vq->desc[desc_indexes[i]].addr);
 			if (unlikely(!desc))
 				break;
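The long comment added in the broadcast_rarp hunk above describes a false-sharing fix: an unconditional cmpxchg writes the cacheline even when the value does not change, so the flag is now read first and the atomic swap is attempted only when it is likely set. The pattern in isolation, using the DPDK atomics named in the diff (the helper name is illustrative):

    #include <rte_atomic.h>

    static int
    test_and_clear_flag(rte_atomic16_t *flag)
    {
    	/* A plain read keeps the cacheline in shared state in the
    	 * common case where the flag is not set. */
    	if (!rte_atomic16_read(flag))
    		return 0;

    	/* Only one thread wins the 1 -> 0 transition. */
    	return rte_atomic16_cmpset((volatile uint16_t *)&flag->cnt, 1, 0);
    }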