Diffstat (limited to 'lib/librte_vhost')
-rw-r--r--  lib/librte_vhost/Makefile                  13
-rw-r--r--  lib/librte_vhost/fd_man.c                  98
-rw-r--r--  lib/librte_vhost/fd_man.h                  17
-rw-r--r--  lib/librte_vhost/iotlb.c                   10
-rw-r--r--  lib/librte_vhost/iotlb.h                    2
-rw-r--r--  lib/librte_vhost/meson.build               11
-rw-r--r--  lib/librte_vhost/rte_vdpa.h                87
-rw-r--r--  lib/librte_vhost/rte_vhost.h              212
-rw-r--r--  lib/librte_vhost/rte_vhost_crypto.h       109
-rw-r--r--  lib/librte_vhost/rte_vhost_version.map     24
-rw-r--r--  lib/librte_vhost/socket.c                 288
-rw-r--r--  lib/librte_vhost/vdpa.c                   115
-rw-r--r--  lib/librte_vhost/vhost.c                  312
-rw-r--r--  lib/librte_vhost/vhost.h                  358
-rw-r--r--  lib/librte_vhost/vhost_crypto.c          1372
-rw-r--r--  lib/librte_vhost/vhost_user.c             581
-rw-r--r--  lib/librte_vhost/vhost_user.h              56
-rw-r--r--  lib/librte_vhost/virtio_crypto.h          422
-rw-r--r--  lib/librte_vhost/virtio_net.c            1460
19 files changed, 4758 insertions, 789 deletions
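
Before the diff body, a minimal usage sketch (not part of this patch) of the new experimental vDPA API it introduces: a backend registers itself with rte_vdpa_register_device() and is then bound to a vhost-user socket with rte_vhost_driver_attach_vdpa_device(). The API and struct names come from rte_vdpa.h and rte_vhost.h below; the helper example_attach_vdpa and its parameters are illustrative assumptions only.

/*
 * Hypothetical sketch: register a vDPA backend and attach it to an
 * already-registered vhost-user socket. Error handling is minimal.
 */
#include <rte_pci.h>
#include <rte_vdpa.h>
#include <rte_vhost.h>

static int
example_attach_vdpa(const char *path, const struct rte_pci_addr *pci,
		struct rte_vdpa_dev_ops *ops)
{
	struct rte_vdpa_dev_addr addr = {
		.type = PCI_ADDR,
		.pci_addr = *pci,
	};
	int did;

	/* Returns the vDPA device id (did) on success, -1 on failure. */
	did = rte_vdpa_register_device(&addr, ops);
	if (did < 0)
		return -1;

	/* Bind the backend to the vhost-user socket at 'path'. */
	if (rte_vhost_driver_attach_vdpa_device(path, did) < 0) {
		rte_vdpa_unregister_device(did);
		return -1;
	}

	return did;
}

Once attached, the id can later be looked up per connected device with rte_vhost_get_vdpa_device_id(), as declared further down in rte_vhost.h.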
diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 5d6c6aba..de431fbb 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -18,13 +18,20 @@ LDLIBS += -lpthread ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif -LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net +LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net \ + -lrte_cryptodev -lrte_hash # all source are stored in SRCS-y SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \ - vhost_user.c virtio_net.c + vhost_user.c virtio_net.c vdpa.c # install includes -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h + +# only compile vhost crypto when cryptodev is enabled +ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_crypto.c +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost_crypto.h +endif include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_vhost/fd_man.c b/lib/librte_vhost/fd_man.c index 181711c2..38347ab1 100644 --- a/lib/librte_vhost/fd_man.c +++ b/lib/librte_vhost/fd_man.c @@ -16,6 +16,9 @@ #include "fd_man.h" + +#define RTE_LOGTYPE_VHOST_FDMAN RTE_LOGTYPE_USER1 + #define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) static int @@ -171,6 +174,38 @@ fdset_del(struct fdset *pfdset, int fd) return dat; } +/** + * Unregister the fd from the fdset. + * + * If parameters are invalid, return directly -2. + * And check whether fd is busy, if yes, return -1. + * Otherwise, try to delete the fd from fdset and + * return true. + */ +int +fdset_try_del(struct fdset *pfdset, int fd) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -2; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -1; + } + + if (i != -1) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + } + + pthread_mutex_unlock(&pfdset->fd_mutex); + return 0; +} /** * This functions runs in infinite blocking loop until there is no fd in @@ -258,7 +293,7 @@ fdset_event_dispatch(void *arg) * because the fd is closed in the cb, * the old fd val could be reused by when creates new * listen fd in another thread, we couldn't call - * fd_set_del. + * fdset_del. 
*/ if (remove1 || remove2) { pfdentry->fd = -1; @@ -272,3 +307,64 @@ fdset_event_dispatch(void *arg) return NULL; } + +static void +fdset_pipe_read_cb(int readfd, void *dat __rte_unused, + int *remove __rte_unused) +{ + char charbuf[16]; + int r = read(readfd, charbuf, sizeof(charbuf)); + /* + * Just an optimization, we don't care if read() failed + * so ignore explicitly its return value to make the + * compiler happy + */ + RTE_SET_USED(r); +} + +void +fdset_pipe_uninit(struct fdset *fdset) +{ + fdset_del(fdset, fdset->u.readfd); + close(fdset->u.readfd); + close(fdset->u.writefd); +} + +int +fdset_pipe_init(struct fdset *fdset) +{ + int ret; + + if (pipe(fdset->u.pipefd) < 0) { + RTE_LOG(ERR, VHOST_FDMAN, + "failed to create pipe for vhost fdset\n"); + return -1; + } + + ret = fdset_add(fdset, fdset->u.readfd, + fdset_pipe_read_cb, NULL, NULL); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_FDMAN, + "failed to add pipe readfd %d into vhost server fdset\n", + fdset->u.readfd); + + fdset_pipe_uninit(fdset); + return -1; + } + + return 0; +} + +void +fdset_pipe_notify(struct fdset *fdset) +{ + int r = write(fdset->u.writefd, "1", 1); + /* + * Just an optimization, we don't care if write() failed + * so ignore explicitly its return value to make the + * compiler happy + */ + RTE_SET_USED(r); + +} diff --git a/lib/librte_vhost/fd_man.h b/lib/librte_vhost/fd_man.h index 3a9276c3..3331bcd9 100644 --- a/lib/librte_vhost/fd_man.h +++ b/lib/librte_vhost/fd_man.h @@ -25,6 +25,16 @@ struct fdset { struct fdentry fd[MAX_FDS]; pthread_mutex_t fd_mutex; int num; /* current fd number of this fdset */ + + union pipefds { + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; + } u; }; @@ -34,7 +44,14 @@ int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat); void *fdset_del(struct fdset *pfdset, int fd); +int fdset_try_del(struct fdset *pfdset, int fd); void *fdset_event_dispatch(void *arg); +int fdset_pipe_init(struct fdset *fdset); + +void fdset_pipe_uninit(struct fdset *fdset); + +void fdset_pipe_notify(struct fdset *fdset); + #endif diff --git a/lib/librte_vhost/iotlb.c b/lib/librte_vhost/iotlb.c index c11ebcaa..c6354fef 100644 --- a/lib/librte_vhost/iotlb.c +++ b/lib/librte_vhost/iotlb.c @@ -303,6 +303,13 @@ out: return vva; } +void +vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq) +{ + vhost_user_iotlb_cache_remove_all(vq); + vhost_user_iotlb_pending_remove_all(vq); +} + int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) { @@ -315,8 +322,7 @@ vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) * The cache has already been initialized, * just drop all cached and pending entries. 
*/ - vhost_user_iotlb_cache_remove_all(vq); - vhost_user_iotlb_pending_remove_all(vq); + vhost_user_iotlb_flush_all(vq); } #ifdef RTE_LIBRTE_VHOST_NUMA diff --git a/lib/librte_vhost/iotlb.h b/lib/librte_vhost/iotlb.h index e7083e37..60b9e4c5 100644 --- a/lib/librte_vhost/iotlb.h +++ b/lib/librte_vhost/iotlb.h @@ -73,7 +73,7 @@ void vhost_user_iotlb_pending_insert(struct vhost_virtqueue *vq, uint64_t iova, uint8_t perm); void vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq, uint64_t iova, uint64_t size, uint8_t perm); - +void vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq); int vhost_user_iotlb_init(struct virtio_net *dev, int vq_index); #endif /* _VHOST_IOTLB_H_ */ diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index 9e8c0e76..bd62e0e3 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -1,5 +1,5 @@ # SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017 Intel Corporation +# Copyright(c) 2017-2018 Intel Corporation if host_machine.system() != 'linux' build = false @@ -9,7 +9,8 @@ if has_libnuma == 1 endif version = 4 allow_experimental_apis = true -sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vhost.c', 'vhost_user.c', - 'virtio_net.c') -headers = files('rte_vhost.h') -deps += ['ethdev'] +sources = files('fd_man.c', 'iotlb.c', 'socket.c', 'vdpa.c', + 'vhost.c', 'vhost_user.c', + 'virtio_net.c', 'vhost_crypto.c') +headers = files('rte_vhost.h', 'rte_vdpa.h', 'rte_vhost_crypto.h') +deps += ['ethdev', 'cryptodev', 'hash', 'pci'] diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h new file mode 100644 index 00000000..90465ca2 --- /dev/null +++ b/lib/librte_vhost/rte_vdpa.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _RTE_VDPA_H_ +#define _RTE_VDPA_H_ + +/** + * @file + * + * Device specific vhost lib + */ + +#include <rte_pci.h> +#include "rte_vhost.h" + +#define MAX_VDPA_NAME_LEN 128 + +enum vdpa_addr_type { + PCI_ADDR, + VDPA_ADDR_MAX +}; + +struct rte_vdpa_dev_addr { + enum vdpa_addr_type type; + union { + uint8_t __dummy[64]; + struct rte_pci_addr pci_addr; + }; +}; + +struct rte_vdpa_dev_ops { + /* Get capabilities of this device */ + int (*get_queue_num)(int did, uint32_t *queue_num); + int (*get_features)(int did, uint64_t *features); + int (*get_protocol_features)(int did, uint64_t *protocol_features); + + /* Driver configure/close the device */ + int (*dev_conf)(int vid); + int (*dev_close)(int vid); + + /* Enable/disable this vring */ + int (*set_vring_state)(int vid, int vring, int state); + + /* Set features when changed */ + int (*set_features)(int vid); + + /* Destination operations when migration done */ + int (*migration_done)(int vid); + + /* Get the vfio group fd */ + int (*get_vfio_group_fd)(int vid); + + /* Get the vfio device fd */ + int (*get_vfio_device_fd)(int vid); + + /* Get the notify area info of the queue */ + int (*get_notify_area)(int vid, int qid, + uint64_t *offset, uint64_t *size); + + /* Reserved for future extension */ + void *reserved[5]; +}; + +struct rte_vdpa_device { + struct rte_vdpa_dev_addr addr; + struct rte_vdpa_dev_ops *ops; +} __rte_cache_aligned; + +/* Register a vdpa device, return did if successful, -1 on failure */ +int __rte_experimental +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr, + struct rte_vdpa_dev_ops *ops); + +/* Unregister a vdpa device, return -1 on failure */ +int __rte_experimental +rte_vdpa_unregister_device(int did); + +/* Find did of a vdpa 
device, return -1 on failure */ +int __rte_experimental +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr); + +/* Find a vdpa device based on did */ +struct rte_vdpa_device * __rte_experimental +rte_vdpa_get_device(int did); + +#endif /* _RTE_VDPA_H_ */ diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index d3320699..b02673d4 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -29,6 +29,48 @@ extern "C" { #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) #define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3) +/** Protocol features. */ +#ifndef VHOST_USER_PROTOCOL_F_MQ +#define VHOST_USER_PROTOCOL_F_MQ 0 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_RARP +#define VHOST_USER_PROTOCOL_F_RARP 2 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_NET_MTU +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION +#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD +#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER +#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 +#endif + +/** Indicate whether protocol features negotiation is supported. */ +#ifndef VHOST_USER_F_PROTOCOL_FEATURES +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#endif + /** * Information relating to memory regions including offsets to * addresses in QEMUs memory file. @@ -90,6 +132,11 @@ struct vhost_device_ops { /** * Convert guest physical address to host virtual address * + * This function is deprecated because unsafe. + * New rte_vhost_va_from_guest_pa() should be used instead to ensure + * guest physical ranges are fully and contiguously mapped into + * process virtual address space. + * * @param mem * the guest memory regions * @param gpa @@ -97,6 +144,7 @@ struct vhost_device_ops { * @return * the host virtual address on success, 0 on failure */ +__rte_deprecated static __rte_always_inline uint64_t rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) { @@ -115,6 +163,46 @@ rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) return 0; } +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. 
+ * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure + */ +static __rte_always_inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + #define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) /** @@ -170,6 +258,41 @@ int rte_vhost_driver_register(const char *path, uint64_t flags); int rte_vhost_driver_unregister(const char *path); /** + * Set the vdpa device id, enforce single connection per socket + * + * @param path + * The vhost-user socket file path + * @param did + * Device id + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_attach_vdpa_device(const char *path, int did); + +/** + * Unset the vdpa device id + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_detach_vdpa_device(const char *path); + +/** + * Get the device id + * + * @param path + * The vhost-user socket file path + * @return + * Device id, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_vdpa_device_id(const char *path); + +/** * Set the feature bits the vhost-user driver supports. * * @param path @@ -225,6 +348,33 @@ int rte_vhost_driver_disable_features(const char *path, uint64_t features); int rte_vhost_driver_get_features(const char *path, uint64_t *features); /** + * Get the protocol feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param protocol_features + * A pointer to store the queried protocol feature bits + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_protocol_features(const char *path, + uint64_t *protocol_features); + +/** + * Get the queue number bits before feature negotiation. 
+ * + * @param path + * The vhost-user socket file path + * @param queue_num + * A pointer to store the queried queue number bits + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num); + +/** * Get the feature bits after negotiation * * @param vid @@ -434,6 +584,68 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx); */ uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid); +/** + * Get log base and log size of the vhost device + * + * @param vid + * vhost device ID + * @param log_base + * vhost log base + * @param log_size + * vhost log size + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size); + +/** + * Get last_avail/used_idx of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param last_avail_idx + * vhost last_avail_idx to get + * @param last_used_idx + * vhost last_used_idx to get + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx); + +/** + * Set last_avail/used_idx of the vhost virtqueue + * + * @param vid + * vhost device ID + * @param queue_id + * vhost queue index + * @param last_avail_idx + * last_avail_idx to set + * @param last_used_idx + * last_used_idx to set + * @return + * 0 on success, -1 on failure + */ +int __rte_experimental +rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx); + +/** + * Get vdpa device id for vhost device. + * + * @param vid + * vhost device id + * @return + * device id + */ +int __rte_experimental +rte_vhost_get_vdpa_device_id(int vid); + #ifdef __cplusplus } #endif diff --git a/lib/librte_vhost/rte_vhost_crypto.h b/lib/librte_vhost/rte_vhost_crypto.h new file mode 100644 index 00000000..f9fbc054 --- /dev/null +++ b/lib/librte_vhost/rte_vhost_crypto.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef _VHOST_CRYPTO_H_ +#define _VHOST_CRYPTO_H_ + +#define VHOST_CRYPTO_MBUF_POOL_SIZE (8192) +#define VHOST_CRYPTO_MAX_BURST_SIZE (64) +#define VHOST_CRYPTO_SESSION_MAP_ENTRIES (1024) /**< Max nb sessions */ +/** max nb virtual queues in a burst for finalizing*/ +#define VIRTIO_CRYPTO_MAX_NUM_BURST_VQS (64) + +enum rte_vhost_crypto_zero_copy { + RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE = 0, + RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE = 1, + RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS +}; + +/** + * Create Vhost-crypto instance + * + * @param vid + * The identifier of the vhost device. + * @param cryptodev_id + * The identifier of DPDK Cryptodev, the same cryptodev_id can be assigned to + * multiple Vhost-crypto devices. + * @param sess_pool + * The pointer to the created cryptodev session pool with the private data size + * matches the target DPDK Cryptodev. + * @param socket_id + * NUMA Socket ID to allocate resources on. * + * @return + * 0 if the Vhost Crypto Instance is created successfully. + * Negative integer if otherwise + */ +int __rte_experimental +rte_vhost_crypto_create(int vid, uint8_t cryptodev_id, + struct rte_mempool *sess_pool, int socket_id); + +/** + * Free the Vhost-crypto instance + * + * @param vid + * The identifier of the vhost device. + * @return + * 0 if the Vhost Crypto Instance is created successfully. + * Negative integer if otherwise. 
+ */ +int __rte_experimental +rte_vhost_crypto_free(int vid); + +/** + * Enable or disable zero copy feature + * + * @param vid + * The identifier of the vhost device. + * @param option + * Flag of zero copy feature. + * @return + * 0 if completed successfully. + * Negative integer if otherwise. + */ +int __rte_experimental +rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option); + +/** + * Fetch a number of vring descriptors from virt-queue and translate to DPDK + * crypto operations. After this function is executed, the user can enqueue + * the processed ops to the target cryptodev. + * + * @param vid + * The identifier of the vhost device. + * @param qid + * Virtio queue index. + * @param ops + * The address of an array of pointers to *rte_crypto_op* structures that must + * be large enough to store *nb_ops* pointers in it. + * @param nb_ops + * The maximum number of operations to be fetched and translated. + * @return + * The number of fetched and processed vhost crypto request operations. + */ +uint16_t __rte_experimental +rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + struct rte_crypto_op **ops, uint16_t nb_ops); +/** + * Finalize the dequeued crypto ops. After the translated crypto ops are + * dequeued from the cryptodev, this function shall be called to write the + * processed data back to the vring descriptor (if no-copy is turned off). + * + * @param ops + * The address of an array of *rte_crypto_op* structure that was dequeued + * from cryptodev. + * @param nb_ops + * The number of operations contained in the array. + * @callfds + * The callfd number(s) contained in this burst, this shall be an array with + * no less than VIRTIO_CRYPTO_MAX_NUM_BURST_VQS elements. + * @nb_callfds + * The number of call_fd numbers exist in the callfds. + * @return + * The number of ops processed. + */ +uint16_t __rte_experimental +rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfds, uint16_t *nb_callfds); + +#endif /**< _VHOST_CRYPTO_H_ */ diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map index df010312..da220dd0 100644 --- a/lib/librte_vhost/rte_vhost_version.map +++ b/lib/librte_vhost/rte_vhost_version.map @@ -59,3 +59,27 @@ DPDK_18.02 { rte_vhost_vring_call; } DPDK_17.08; + +EXPERIMENTAL { + global: + + rte_vdpa_register_device; + rte_vdpa_unregister_device; + rte_vdpa_find_device_id; + rte_vdpa_get_device; + rte_vhost_driver_attach_vdpa_device; + rte_vhost_driver_detach_vdpa_device; + rte_vhost_driver_get_vdpa_device_id; + rte_vhost_get_vdpa_device_id; + rte_vhost_driver_get_protocol_features; + rte_vhost_driver_get_queue_num; + rte_vhost_get_log_base; + rte_vhost_get_vring_base; + rte_vhost_set_vring_base; + rte_vhost_crypto_create; + rte_vhost_crypto_free; + rte_vhost_crypto_fetch_requests; + rte_vhost_crypto_finalize_requests; + rte_vhost_crypto_set_zero_copy; + rte_vhost_va_from_guest_pa; +}; diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c index 83befdce..d6303174 100644 --- a/lib/librte_vhost/socket.c +++ b/lib/librte_vhost/socket.c @@ -4,7 +4,6 @@ #include <stdint.h> #include <stdio.h> -#include <stdbool.h> #include <limits.h> #include <stdlib.h> #include <unistd.h> @@ -52,6 +51,13 @@ struct vhost_user_socket { uint64_t supported_features; uint64_t features; + /* + * Device id to identify a specific backend device. + * It's set to -1 for the default software implementation. + * If valid, one socket can have 1 connection only. 
+ */ + int vdpa_dev_id; + struct vhost_device_ops const *notify_ops; }; @@ -97,6 +103,7 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) size_t fdsize = fd_num * sizeof(int); char control[CMSG_SPACE(fdsize)]; struct cmsghdr *cmsg; + int got_fds = 0; int ret; memset(&msgh, 0, sizeof(msgh)); @@ -123,11 +130,16 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) cmsg = CMSG_NXTHDR(&msgh, cmsg)) { if ((cmsg->cmsg_level == SOL_SOCKET) && (cmsg->cmsg_type == SCM_RIGHTS)) { - memcpy(fds, CMSG_DATA(cmsg), fdsize); + got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int)); break; } } + /* Clear out unused file descriptors */ + while (got_fds < fd_num) + fds[got_fds++] = -1; + return ret; } @@ -153,6 +165,11 @@ send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) msgh.msg_control = control; msgh.msg_controllen = sizeof(control); cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } cmsg->cmsg_len = CMSG_LEN(fdsize); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; @@ -163,7 +180,7 @@ send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) } do { - ret = sendmsg(sockfd, &msgh, 0); + ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL); } while (ret < 0 && errno == EINTR); if (ret < 0) { @@ -182,6 +199,9 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) struct vhost_user_connection *conn; int ret; + if (vsocket == NULL) + return; + conn = malloc(sizeof(*conn)); if (conn == NULL) { close(fd); @@ -198,6 +218,8 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net); + vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id); + if (vsocket->dequeue_zero_copy) vhost_enable_dequeue_zero_copy(vid); @@ -232,6 +254,8 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) pthread_mutex_lock(&vsocket->conn_mutex); TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); pthread_mutex_unlock(&vsocket->conn_mutex); + + fdset_pipe_notify(&vhost_user.fdset); return; err: @@ -318,6 +342,16 @@ vhost_user_start_server(struct vhost_user_socket *vsocket) int fd = vsocket->socket_fd; const char *path = vsocket->path; + /* + * bind () may fail if the socket file with the same name already + * exists. But the library obviously should not delete the file + * provided by the user, since we can not be sure that it is not + * being used by other applications. Moreover, many applications form + * socket names based on user input, which is prone to errors. + * + * The user must ensure that the socket does not exist before + * registering the vhost driver in server mode. 
+ */ ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); if (ret < 0) { RTE_LOG(ERR, VHOST_CONFIG, @@ -436,7 +470,6 @@ static int vhost_user_reconnect_init(void) { int ret; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; ret = pthread_mutex_init(&reconn_list.mutex, NULL); if (ret < 0) { @@ -445,7 +478,7 @@ vhost_user_reconnect_init(void) } TAILQ_INIT(&reconn_list.head); - ret = pthread_create(&reconn_tid, NULL, + ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL, vhost_user_client_reconnect, NULL); if (ret != 0) { RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); @@ -453,14 +486,6 @@ vhost_user_reconnect_init(void) RTE_LOG(ERR, VHOST_CONFIG, "failed to destroy reconnect mutex"); } - } else { - snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, - "vhost-reconn"); - - if (rte_thread_setname(reconn_tid, thread_name)) { - RTE_LOG(DEBUG, VHOST_CONFIG, - "failed to set reconnect thread name"); - } } return ret; @@ -524,6 +549,52 @@ find_vhost_user_socket(const char *path) } int +rte_vhost_driver_attach_vdpa_device(const char *path, int did) +{ + struct vhost_user_socket *vsocket; + + if (rte_vdpa_get_device(did) == NULL) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->vdpa_dev_id = did; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_detach_vdpa_device(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->vdpa_dev_id = -1; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_get_vdpa_device_id(const char *path) +{ + struct vhost_user_socket *vsocket; + int did = -1; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + did = vsocket->vdpa_dev_id; + pthread_mutex_unlock(&vhost_user.mutex); + + return did; +} + +int rte_vhost_driver_disable_features(const char *path, uint64_t features) { struct vhost_user_socket *vsocket; @@ -591,19 +662,136 @@ int rte_vhost_driver_get_features(const char *path, uint64_t *features) { struct vhost_user_socket *vsocket; + uint64_t vdpa_features; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; pthread_mutex_lock(&vhost_user.mutex); vsocket = find_vhost_user_socket(path); - if (vsocket) + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + ret = -1; + goto unlock_exit; + } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_features) { *features = vsocket->features; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa features " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *features = vsocket->features & vdpa_features; + +unlock_exit: pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +int +rte_vhost_driver_get_protocol_features(const char *path, + uint64_t *protocol_features) +{ + struct vhost_user_socket *vsocket; + uint64_t vdpa_protocol_features; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); if (!vsocket) { RTE_LOG(ERR, VHOST_CONFIG, "socket file %s is not registered yet.\n", path); - return -1; - } else { - return 0; + ret = -1; + goto unlock_exit; 
+ } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) { + *protocol_features = VHOST_USER_PROTOCOL_FEATURES; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_protocol_features(did, + &vdpa_protocol_features) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa protocol features " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *protocol_features = VHOST_USER_PROTOCOL_FEATURES + & vdpa_protocol_features; + +unlock_exit: + pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +int +rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num) +{ + struct vhost_user_socket *vsocket; + uint32_t vdpa_queue_num; + struct rte_vdpa_device *vdpa_dev; + int did = -1; + int ret = 0; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + ret = -1; + goto unlock_exit; + } + + did = vsocket->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) { + *queue_num = VHOST_MAX_QUEUE_PAIRS; + goto unlock_exit; + } + + if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get vdpa queue number " + "for socket file %s.\n", path); + ret = -1; + goto unlock_exit; + } + + *queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num); + +unlock_exit: + pthread_mutex_unlock(&vhost_user.mutex); + return ret; +} + +static void +vhost_user_socket_mem_free(struct vhost_user_socket *vsocket) +{ + if (vsocket && vsocket->path) { + free(vsocket->path); + vsocket->path = NULL; + } + + if (vsocket) { + free(vsocket); + vsocket = NULL; } } @@ -637,7 +825,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags) if (vsocket->path == NULL) { RTE_LOG(ERR, VHOST_CONFIG, "error: failed to copy socket path string\n"); - free(vsocket); + vhost_user_socket_mem_free(vsocket); goto out; } TAILQ_INIT(&vsocket->conn_list); @@ -665,6 +853,12 @@ rte_vhost_driver_register(const char *path, uint64_t flags) vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + /* Dequeue zero copy can't assure descriptors returned in order */ + if (vsocket->dequeue_zero_copy) { + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER); + vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER); + } + if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) { vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); @@ -695,8 +889,7 @@ out_mutex: "error: failed to destroy connection mutex\n"); } out_free: - free(vsocket->path); - free(vsocket); + vhost_user_socket_mem_free(vsocket); out: pthread_mutex_unlock(&vhost_user.mutex); @@ -743,21 +936,25 @@ rte_vhost_driver_unregister(const char *path) struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; if (!strcmp(vsocket->path, path)) { - if (vsocket->is_server) { - fdset_del(&vhost_user.fdset, vsocket->socket_fd); - close(vsocket->socket_fd); - unlink(path); - } else if (vsocket->reconnect) { - vhost_user_remove_reconnect(vsocket); - } - +again: pthread_mutex_lock(&vsocket->conn_mutex); for (conn = TAILQ_FIRST(&vsocket->conn_list); conn != NULL; conn = next) { next = TAILQ_NEXT(conn, next); - fdset_del(&vhost_user.fdset, conn->connfd); + /* + * If r/wcb is executing, release the + * conn_mutex lock, and try again since + * the r/wcb may 
use the conn_mutex lock. + */ + if (fdset_try_del(&vhost_user.fdset, + conn->connfd) == -1) { + pthread_mutex_unlock( + &vsocket->conn_mutex); + goto again; + } + RTE_LOG(INFO, VHOST_CONFIG, "free connfd = %d for device '%s'\n", conn->connfd, path); @@ -768,9 +965,17 @@ rte_vhost_driver_unregister(const char *path) } pthread_mutex_unlock(&vsocket->conn_mutex); + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, + vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + pthread_mutex_destroy(&vsocket->conn_mutex); - free(vsocket->path); - free(vsocket); + vhost_user_socket_mem_free(vsocket); count = --vhost_user.vsocket_cnt; vhost_user.vsockets[i] = vhost_user.vsockets[count]; @@ -829,11 +1034,26 @@ rte_vhost_driver_start(const char *path) return -1; if (fdset_tid == 0) { - int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, - &vhost_user.fdset); - if (ret != 0) + /** + * create a pipe which will be waited by poll and notified to + * rebuild the wait list of poll. + */ + if (fdset_pipe_init(&vhost_user.fdset) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create pipe for vhost fdset\n"); + return -1; + } + + int ret = rte_ctrl_thread_create(&fdset_tid, + "vhost-events", NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret != 0) { RTE_LOG(ERR, VHOST_CONFIG, "failed to create fdset handling thread"); + + fdset_pipe_uninit(&vhost_user.fdset); + return -1; + } } if (vsocket->is_server) diff --git a/lib/librte_vhost/vdpa.c b/lib/librte_vhost/vdpa.c new file mode 100644 index 00000000..c82fd437 --- /dev/null +++ b/lib/librte_vhost/vdpa.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +/** + * @file + * + * Device specific vhost lib + */ + +#include <stdbool.h> + +#include <rte_malloc.h> +#include "rte_vdpa.h" +#include "vhost.h" + +static struct rte_vdpa_device *vdpa_devices[MAX_VHOST_DEVICE]; +static uint32_t vdpa_device_num; + +static bool +is_same_vdpa_device(struct rte_vdpa_dev_addr *a, + struct rte_vdpa_dev_addr *b) +{ + bool ret = true; + + if (a->type != b->type) + return false; + + switch (a->type) { + case PCI_ADDR: + if (a->pci_addr.domain != b->pci_addr.domain || + a->pci_addr.bus != b->pci_addr.bus || + a->pci_addr.devid != b->pci_addr.devid || + a->pci_addr.function != b->pci_addr.function) + ret = false; + break; + default: + break; + } + + return ret; +} + +int +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr, + struct rte_vdpa_dev_ops *ops) +{ + struct rte_vdpa_device *dev; + char device_name[MAX_VDPA_NAME_LEN]; + int i; + + if (vdpa_device_num >= MAX_VHOST_DEVICE) + return -1; + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + dev = vdpa_devices[i]; + if (dev && is_same_vdpa_device(&dev->addr, addr)) + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vdpa_devices[i] == NULL) + break; + } + + sprintf(device_name, "vdpa-dev-%d", i); + dev = rte_zmalloc(device_name, sizeof(struct rte_vdpa_device), + RTE_CACHE_LINE_SIZE); + if (!dev) + return -1; + + memcpy(&dev->addr, addr, sizeof(struct rte_vdpa_dev_addr)); + dev->ops = ops; + vdpa_devices[i] = dev; + vdpa_device_num++; + + return i; +} + +int +rte_vdpa_unregister_device(int did) +{ + if (did < 0 || did >= MAX_VHOST_DEVICE || vdpa_devices[did] == NULL) + return -1; + + rte_free(vdpa_devices[did]); + vdpa_devices[did] = NULL; + vdpa_device_num--; + + return did; +} + +int +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr) +{ + 
struct rte_vdpa_device *dev; + int i; + + for (i = 0; i < MAX_VHOST_DEVICE; ++i) { + dev = vdpa_devices[i]; + if (dev && is_same_vdpa_device(&dev->addr, addr)) + return i; + } + + return -1; +} + +struct rte_vdpa_device * +rte_vdpa_get_device(int did) +{ + if (did < 0 || did >= MAX_VHOST_DEVICE) + return NULL; + + return vdpa_devices[did]; +} diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index a407067e..3c9be10a 100644 --- a/lib/librte_vhost/vhost.c +++ b/lib/librte_vhost/vhost.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2016 Intel Corporation + * Copyright(c) 2010-2017 Intel Corporation */ #include <linux/vhost.h> @@ -29,17 +29,17 @@ struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; /* Called with iotlb_lock read-locked */ uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t iova, uint64_t size, uint8_t perm) + uint64_t iova, uint64_t *size, uint8_t perm) { uint64_t vva, tmp_size; - if (unlikely(!size)) + if (unlikely(!*size)) return 0; - tmp_size = size; + tmp_size = *size; vva = vhost_user_iotlb_cache_find(vq, iova, &tmp_size, perm); - if (tmp_size == size) + if (tmp_size == *size) return vva; iova += tmp_size; @@ -68,19 +68,6 @@ __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -struct virtio_net * -get_device(int vid) -{ - struct virtio_net *dev = vhost_devices[vid]; - - if (unlikely(!dev)) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) device not found.\n", vid); - } - - return dev; -} - void cleanup_vq(struct vhost_virtqueue *vq, int destroy) { @@ -106,9 +93,12 @@ cleanup_device(struct virtio_net *dev, int destroy) } void -free_vq(struct vhost_virtqueue *vq) +free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq) { - rte_free(vq->shadow_used_ring); + if (vq_is_packed(dev)) + rte_free(vq->shadow_used_packed); + else + rte_free(vq->shadow_used_split); rte_free(vq->batch_copy_elems); rte_mempool_free(vq->iotlb_pool); rte_free(vq); @@ -123,42 +113,95 @@ free_device(struct virtio_net *dev) uint32_t i; for (i = 0; i < dev->nr_vring; i++) - free_vq(dev->virtqueue[i]); + free_vq(dev, dev->virtqueue[i]); rte_free(dev); } -int -vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +static int +vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { - uint64_t size; + uint64_t req_size, size; - if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) - goto out; - - size = sizeof(struct vring_desc) * vq->size; + req_size = sizeof(struct vring_desc) * vq->size; + size = req_size; vq->desc = (struct vring_desc *)(uintptr_t)vhost_iova_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, - size, VHOST_ACCESS_RW); - if (!vq->desc) + &size, VHOST_ACCESS_RW); + if (!vq->desc || size != req_size) return -1; - size = sizeof(struct vring_avail); - size += sizeof(uint16_t) * vq->size; + req_size = sizeof(struct vring_avail); + req_size += sizeof(uint16_t) * vq->size; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) + req_size += sizeof(uint16_t); + size = req_size; vq->avail = (struct vring_avail *)(uintptr_t)vhost_iova_to_vva(dev, vq, vq->ring_addrs.avail_user_addr, - size, VHOST_ACCESS_RW); - if (!vq->avail) + &size, VHOST_ACCESS_RW); + if (!vq->avail || size != req_size) return -1; - size = sizeof(struct vring_used); - size += sizeof(struct vring_used_elem) * vq->size; + req_size = sizeof(struct vring_used); + req_size += sizeof(struct vring_used_elem) * vq->size; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) + req_size += 
sizeof(uint16_t); + size = req_size; vq->used = (struct vring_used *)(uintptr_t)vhost_iova_to_vva(dev, vq, vq->ring_addrs.used_user_addr, - size, VHOST_ACCESS_RW); - if (!vq->used) + &size, VHOST_ACCESS_RW); + if (!vq->used || size != req_size) + return -1; + + return 0; +} + +static int +vring_translate_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint64_t req_size, size; + + req_size = sizeof(struct vring_packed_desc) * vq->size; + size = req_size; + vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->desc_packed || size != req_size) return -1; + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->driver_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.avail_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->driver_event || size != req_size) + return -1; + + req_size = sizeof(struct vring_packed_desc_event); + size = req_size; + vq->device_event = (struct vring_packed_desc_event *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->ring_addrs.used_user_addr, + &size, VHOST_ACCESS_RW); + if (!vq->device_event || size != req_size) + return -1; + + return 0; +} + +int +vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + + if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + goto out; + + if (vq_is_packed(dev)) { + if (vring_translate_packed(dev, vq) < 0) + return -1; + } else { + if (vring_translate_split(dev, vq) < 0) + return -1; + } out: vq->access_ok = 1; @@ -240,6 +283,9 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) dev->virtqueue[vring_idx] = vq; init_vring_queue(dev, vring_idx); rte_spinlock_init(&vq->access_lock); + vq->avail_wrap_counter = 1; + vq->used_wrap_counter = 1; + vq->signalled_used_valid = false; dev->nr_vring += 1; @@ -274,21 +320,21 @@ vhost_new_device(void) struct virtio_net *dev; int i; - dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); - if (dev == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for new dev.\n"); - return -1; - } - for (i = 0; i < MAX_VHOST_DEVICE; i++) { if (vhost_devices[i] == NULL) break; } + if (i == MAX_VHOST_DEVICE) { RTE_LOG(ERR, VHOST_CONFIG, "Failed to find a free slot for new device.\n"); - rte_free(dev); + return -1; + } + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); return -1; } @@ -296,10 +342,28 @@ vhost_new_device(void) dev->vid = i; dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET; dev->slave_req_fd = -1; + dev->vdpa_dev_id = -1; + rte_spinlock_init(&dev->slave_req_lock); return i; } +void +vhost_destroy_device_notify(struct virtio_net *dev) +{ + struct rte_vdpa_device *vdpa_dev; + int did; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->dev_close) + vdpa_dev->ops->dev_close(dev->vid); + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } +} + /* * Invoked when there is the vhost-user connection is broken (when * the virtio device is being detached). 
@@ -312,10 +376,7 @@ vhost_destroy_device(int vid) if (dev == NULL) return; - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(vid); - } + vhost_destroy_device_notify(dev); cleanup_device(dev, 1); free_device(dev); @@ -324,6 +385,33 @@ vhost_destroy_device(int vid) } void +vhost_attach_vdpa_device(int vid, int did) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (rte_vdpa_get_device(did) == NULL) + return; + + dev->vdpa_dev_id = did; +} + +void +vhost_detach_vdpa_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_user_host_notifier_ctrl(vid, false); + + dev->vdpa_dev_id = -1; +} + +void vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) { struct virtio_net *dev; @@ -532,7 +620,11 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) if (!vq) return -1; - vhost_vring_call(dev, vq); + if (vq_is_packed(dev)) + vhost_vring_call_packed(dev, vq); + else + vhost_vring_call_split(dev, vq); + return 0; } @@ -553,21 +645,52 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id) return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; } +static inline void +vhost_enable_notify_split(struct vhost_virtqueue *vq, int enable) +{ + if (enable) + vq->used->flags &= ~VRING_USED_F_NO_NOTIFY; + else + vq->used->flags |= VRING_USED_F_NO_NOTIFY; +} + +static inline void +vhost_enable_notify_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, int enable) +{ + uint16_t flags; + + if (!enable) + vq->device_event->flags = VRING_EVENT_F_DISABLE; + + flags = VRING_EVENT_F_ENABLE; + if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + flags = VRING_EVENT_F_DESC; + vq->device_event->off_wrap = vq->last_avail_idx | + vq->avail_wrap_counter << 15; + } + + rte_smp_wmb(); + + vq->device_event->flags = flags; +} + int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) { struct virtio_net *dev = get_device(vid); + struct vhost_virtqueue *vq; - if (dev == NULL) + if (!dev) return -1; - if (enable) { - RTE_LOG(ERR, VHOST_CONFIG, - "guest notification isn't supported.\n"); - return -1; - } + vq = dev->virtqueue[queue_id]; + + if (vq_is_packed(dev)) + vhost_enable_notify_packed(dev, vq, enable); + else + vhost_enable_notify_split(vq, enable); - dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; return 0; } @@ -627,3 +750,76 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid) return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx; } + +int rte_vhost_get_vdpa_device_id(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + return dev->vdpa_dev_id; +} + +int rte_vhost_get_log_base(int vid, uint64_t *log_base, + uint64_t *log_size) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + *log_base = dev->log_base; + *log_size = dev->log_size; + + return 0; +} + +int rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + *last_avail_idx = 
dev->virtqueue[queue_id]->last_avail_idx; + *last_used_idx = dev->virtqueue[queue_id]->last_used_idx; + + return 0; +} + +int rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -1; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return -1; + } + + dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx; + dev->virtqueue[queue_id]->last_used_idx = last_used_idx; + + return 0; +} diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index d947bc9e..760a09c0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -1,11 +1,12 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ #ifndef _VHOST_NET_CDEV_H_ #define _VHOST_NET_CDEV_H_ #include <stdint.h> #include <stdio.h> +#include <stdbool.h> #include <sys/types.h> #include <sys/queue.h> #include <unistd.h> @@ -19,6 +20,7 @@ #include <rte_rwlock.h> #include "rte_vhost.h" +#include "rte_vdpa.h" /* Used to indicate that the device is running on a data core */ #define VIRTIO_DEV_RUNNING 1 @@ -26,17 +28,22 @@ #define VIRTIO_DEV_READY 2 /* Used to indicate that the built-in vhost net device backend is enabled */ #define VIRTIO_DEV_BUILTIN_VIRTIO_NET 4 +/* Used to indicate that the device has its own data path and configured */ +#define VIRTIO_DEV_VDPA_CONFIGURED 8 /* Backend value set by guest. */ #define VIRTIO_DEV_STOPPED -1 #define BUF_VECTOR_MAX 256 +#define VHOST_LOG_CACHE_NR 32 + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. */ struct buf_vector { + uint64_t buf_iova; uint64_t buf_addr; uint32_t buf_len; uint32_t desc_idx; @@ -49,6 +56,7 @@ struct buf_vector { struct zcopy_mbuf { struct rte_mbuf *mbuf; uint32_t desc_idx; + uint16_t desc_count; uint16_t in_use; TAILQ_ENTRY(zcopy_mbuf) next; @@ -65,19 +73,43 @@ struct batch_copy_elem { uint64_t log_addr; }; +/* + * Structure that contains the info for batched dirty logging. + */ +struct log_cache_entry { + uint32_t offset; + unsigned long val; +}; + +struct vring_used_elem_packed { + uint16_t id; + uint32_t len; + uint32_t count; +}; + /** * Structure contains variables relevant to RX/TX virtqueues. */ struct vhost_virtqueue { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; uint32_t size; uint16_t last_avail_idx; uint16_t last_used_idx; /* Last used index we notify to front end. 
*/ uint16_t signalled_used; + bool signalled_used_valid; #define VIRTIO_INVALID_EVENTFD (-1) #define VIRTIO_UNINITIALIZED_EVENTFD (-2) @@ -101,12 +133,20 @@ struct vhost_virtqueue { struct zcopy_mbuf *zmbufs; struct zcopy_mbuf_list zmbuf_list; - struct vring_used_elem *shadow_used_ring; + union { + struct vring_used_elem *shadow_used_split; + struct vring_used_elem_packed *shadow_used_packed; + }; uint16_t shadow_used_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; uint16_t batch_copy_nb_elems; + bool used_wrap_counter; + bool avail_wrap_counter; + + struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR]; + uint16_t log_cache_nb_elem; rte_rwlock_t iotlb_lock; rte_rwlock_t iotlb_pending_lock; @@ -174,7 +214,41 @@ struct vhost_msg { #define VIRTIO_F_VERSION_1 32 #endif -#define VHOST_USER_F_PROTOCOL_FEATURES 30 +/* Declare packed ring related bits for older kernels */ +#ifndef VIRTIO_F_RING_PACKED + +#define VIRTIO_F_RING_PACKED 34 + +#define VRING_DESC_F_NEXT 1 +#define VRING_DESC_F_WRITE 2 +#define VRING_DESC_F_INDIRECT 4 + +#define VRING_DESC_F_AVAIL (1ULL << 7) +#define VRING_DESC_F_USED (1ULL << 15) + +struct vring_packed_desc { + uint64_t addr; + uint32_t len; + uint16_t id; + uint16_t flags; +}; + +#define VRING_EVENT_F_ENABLE 0x0 +#define VRING_EVENT_F_DISABLE 0x1 +#define VRING_EVENT_F_DESC 0x2 + +struct vring_packed_desc_event { + uint16_t off_wrap; + uint16_t flags; +}; +#endif + +/* + * Available and used descs are in same order + */ +#ifndef VIRTIO_F_IN_ORDER +#define VIRTIO_F_IN_ORDER 35 +#endif /* Features supported by this builtin vhost-user net driver. */ #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ @@ -199,7 +273,8 @@ struct vhost_msg { (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ - (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_F_IN_ORDER) | \ (1ULL << VIRTIO_F_IOMMU_PLATFORM)) @@ -210,6 +285,51 @@ struct guest_page { }; /** + * function prototype for the vhost backend to handler specific vhost user + * messages prior to the master message handling + * + * @param vid + * vhost device id + * @param msg + * Message pointer. + * @param require_reply + * If the handler requires sending a reply, this varaible shall be written 1, + * otherwise 0. + * @param skip_master + * If the handler requires skipping the master message handling, this variable + * shall be written 1, otherwise 0. + * @return + * 0 on success, -1 on failure + */ +typedef int (*vhost_msg_pre_handle)(int vid, void *msg, + uint32_t *require_reply, uint32_t *skip_master); + +/** + * function prototype for the vhost backend to handler specific vhost user + * messages after the master message handling is done + * + * @param vid + * vhost device id + * @param msg + * Message pointer. + * @param require_reply + * If the handler requires sending a reply, this varaible shall be written 1, + * otherwise 0. + * @return + * 0 on success, -1 on failure + */ +typedef int (*vhost_msg_post_handle)(int vid, void *msg, + uint32_t *require_reply); + +/** + * pre and post vhost user message handlers + */ +struct vhost_user_extern_ops { + vhost_msg_pre_handle pre_msg_handle; + vhost_msg_post_handle post_msg_handle; +}; + +/** * Device structure contains all configuration information relating * to the device. 
*/ @@ -241,8 +361,32 @@ struct virtio_net { struct guest_page *guest_pages; int slave_req_fd; + rte_spinlock_t slave_req_lock; + + /* + * Device id to identify a specific backend device. + * It's set to -1 for the default software implementation. + */ + int vdpa_dev_id; + + /* private data for virtio device */ + void *extern_data; + /* pre and post vhost user message handlers for the device */ + struct vhost_user_extern_ops extern_ops; } __rte_cache_aligned; +static __rte_always_inline bool +vq_is_packed(struct virtio_net *dev) +{ + return dev->features & (1ull << VIRTIO_F_RING_PACKED); +} + +static inline bool +desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) +{ + return wrap_counter == !!(desc->flags & VRING_DESC_F_AVAIL) && + wrap_counter != !!(desc->flags & VRING_DESC_F_USED); +} #define VHOST_LOG_PAGE 4096 @@ -252,7 +396,15 @@ struct virtio_net { static __rte_always_inline void vhost_set_bit(unsigned int nr, volatile uint8_t *addr) { - __sync_fetch_and_or_8(addr, (1U << nr)); +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * __sync_ built-ins are deprecated, but __atomic_ ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or_1(addr, (1U << nr)); +#else + __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED); +#endif } static __rte_always_inline void @@ -284,6 +436,103 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) } static __rte_always_inline void +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + unsigned long *log_base; + int i; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base)) + return; + + log_base = (unsigned long *)(uintptr_t)dev->log_base; + + /* + * It is expected a write memory barrier has been issued + * before this function is called. + */ + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100) + /* + * '__sync' builtins are deprecated, but '__atomic' ones + * are sub-optimized in older GCC versions. + */ + __sync_fetch_and_or(log_base + elem->offset, elem->val); +#else + __atomic_fetch_or(log_base + elem->offset, elem->val, + __ATOMIC_RELAXED); +#endif + } + + rte_smp_wmb(); + + vq->log_cache_nb_elem = 0; +} + +static __rte_always_inline void +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t page) +{ + uint32_t bit_nr = page % (sizeof(unsigned long) << 3); + uint32_t offset = page / (sizeof(unsigned long) << 3); + int i; + + for (i = 0; i < vq->log_cache_nb_elem; i++) { + struct log_cache_entry *elem = vq->log_cache + i; + + if (elem->offset == offset) { + elem->val |= (1UL << bit_nr); + return; + } + } + + if (unlikely(i >= VHOST_LOG_CACHE_NR)) { + /* + * No more room for a new log cache entry, + * so write the dirty log map directly. 
+ */ + rte_smp_wmb(); + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + + return; + } + + vq->log_cache[i].offset = offset; + vq->log_cache[i].val = (1UL << bit_nr); + vq->log_cache_nb_elem++; +} + +static __rte_always_inline void +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_cache_page(dev, vq, page); + page += 1; + } +} + +static __rte_always_inline void +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len); +} + +static __rte_always_inline void vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t offset, uint64_t len) { @@ -296,8 +545,8 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, #ifdef RTE_LIBRTE_VHOST_DEBUG #define VHOST_MAX_PRINT_BUFF 6072 -#define LOG_LEVEL RTE_LOG_DEBUG -#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define VHOST_LOG_DEBUG(log_type, fmt, args...) \ + RTE_LOG(DEBUG, log_type, fmt, ##args) #define PRINT_PACKET(device, addr, size, header) do { \ char *pkt_addr = (char *)(addr); \ unsigned int index; \ @@ -313,11 +562,10 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, } \ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ \ - LOG_DEBUG(VHOST_DATA, "%s", packet); \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ } while (0) #else -#define LOG_LEVEL RTE_LOG_INFO -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) #define PRINT_PACKET(device, addr, size, header) do {} while (0) #endif @@ -345,18 +593,33 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) return 0; } -struct virtio_net *get_device(int vid); +static __rte_always_inline struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} int vhost_new_device(void); void cleanup_device(struct virtio_net *dev, int destroy); void reset_device(struct virtio_net *dev); void vhost_destroy_device(int); +void vhost_destroy_device_notify(struct virtio_net *dev); void cleanup_vq(struct vhost_virtqueue *vq, int destroy); -void free_vq(struct vhost_virtqueue *vq); +void free_vq(struct virtio_net *dev, struct vhost_virtqueue *vq); int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); +void vhost_attach_vdpa_device(int vid, int did); +void vhost_detach_vdpa_device(int vid); + void vhost_set_ifname(int, const char *if_name, unsigned int if_len); void vhost_enable_dequeue_zero_copy(int vid); void vhost_set_builtin_virtio_net(int vid, bool enable); @@ -371,18 +634,18 @@ struct vhost_device_ops const *vhost_driver_callback_get(const char *path); void vhost_backend_cleanup(struct virtio_net *dev); uint64_t __vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t iova, uint64_t size, uint8_t perm); + uint64_t iova, uint64_t *len, uint8_t perm); int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq); void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq); static __rte_always_inline uint64_t vhost_iova_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t iova, uint64_t size, uint8_t perm) + uint64_t iova, uint64_t *len, uint8_t perm) { if (!(dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) - return rte_vhost_gpa_to_vva(dev->mem, iova); + return rte_vhost_va_from_guest_pa(dev->mem, iova, len); - return __vhost_iova_to_vva(dev, vq, iova, size, perm); + return __vhost_iova_to_vva(dev, vq, iova, len, perm); } #define vhost_used_event(vr) \ @@ -401,17 +664,17 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) } static __rte_always_inline void -vhost_vring_call(struct virtio_net *dev, struct vhost_virtqueue *vq) +vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { /* Flush used->idx update before we read avail->flags. */ - rte_mb(); + rte_smp_mb(); /* Don't kick guest if we don't reach index specified by guest. */ if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { uint16_t old = vq->signalled_used; uint16_t new = vq->last_used_idx; - LOG_DEBUG(VHOST_DATA, "%s: used_event_idx=%d, old=%d, new=%d\n", + VHOST_LOG_DEBUG(VHOST_DATA, "%s: used_event_idx=%d, old=%d, new=%d\n", __func__, vhost_used_event(vq), old, new); @@ -428,4 +691,55 @@ vhost_vring_call(struct virtio_net *dev, struct vhost_virtqueue *vq) } } +static __rte_always_inline void +vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t old, new, off, off_wrap; + bool signalled_used_valid, kick = false; + + /* Flush used desc update. 
*/ + rte_smp_mb(); + + if (!(dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))) { + if (vq->driver_event->flags != + VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + old = vq->signalled_used; + new = vq->last_used_idx; + vq->signalled_used = new; + signalled_used_valid = vq->signalled_used_valid; + vq->signalled_used_valid = true; + + if (vq->driver_event->flags != VRING_EVENT_F_DESC) { + if (vq->driver_event->flags != VRING_EVENT_F_DISABLE) + kick = true; + goto kick; + } + + if (unlikely(!signalled_used_valid)) { + kick = true; + goto kick; + } + + rte_smp_rmb(); + + off_wrap = vq->driver_event->off_wrap; + off = off_wrap & ~(1 << 15); + + if (new <= old) + old -= vq->size; + + if (vq->used_wrap_counter != off_wrap >> 15) + off -= vq->size; + + if (vhost_need_event(off, new, old)) + kick = true; +kick: + if (kick) + eventfd_write(vq->callfd, (eventfd_t)1); +} + #endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/librte_vhost/vhost_crypto.c b/lib/librte_vhost/vhost_crypto.c new file mode 100644 index 00000000..57341ef8 --- /dev/null +++ b/lib/librte_vhost/vhost_crypto.c @@ -0,0 +1,1372 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ +#include <rte_malloc.h> +#include <rte_hash.h> +#include <rte_jhash.h> +#include <rte_mbuf.h> +#include <rte_cryptodev.h> + +#include "rte_vhost_crypto.h" +#include "vhost.h" +#include "vhost_user.h" +#include "virtio_crypto.h" + +#define INHDR_LEN (sizeof(struct virtio_crypto_inhdr)) +#define IV_OFFSET (sizeof(struct rte_crypto_op) + \ + sizeof(struct rte_crypto_sym_op)) + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VC_LOG_ERR(fmt, args...) \ + RTE_LOG(ERR, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) +#define VC_LOG_INFO(fmt, args...) \ + RTE_LOG(INFO, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) + +#define VC_LOG_DBG(fmt, args...) \ + RTE_LOG(DEBUG, USER1, "[%s] %s() line %u: " fmt "\n", \ + "Vhost-Crypto", __func__, __LINE__, ## args) +#else +#define VC_LOG_ERR(fmt, args...) \ + RTE_LOG(ERR, USER1, "[VHOST-Crypto]: " fmt "\n", ## args) +#define VC_LOG_INFO(fmt, args...) \ + RTE_LOG(INFO, USER1, "[VHOST-Crypto]: " fmt "\n", ## args) +#define VC_LOG_DBG(fmt, args...) 
+#endif + +#define VIRTIO_CRYPTO_FEATURES ((1 << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1 << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1 << VIRTIO_RING_F_EVENT_IDX) | \ + (1 << VIRTIO_CRYPTO_SERVICE_CIPHER) | \ + (1 << VIRTIO_CRYPTO_SERVICE_MAC) | \ + (1 << VIRTIO_NET_F_CTRL_VQ)) + +#define IOVA_TO_VVA(t, r, a, l, p) \ + ((t)(uintptr_t)vhost_iova_to_vva(r->dev, r->vq, a, l, p)) + +static int +cipher_algo_transform(uint32_t virtio_cipher_algo) +{ + int ret; + + switch (virtio_cipher_algo) { + case VIRTIO_CRYPTO_CIPHER_AES_CBC: + ret = RTE_CRYPTO_CIPHER_AES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_AES_CTR: + ret = RTE_CRYPTO_CIPHER_AES_CTR; + break; + case VIRTIO_CRYPTO_CIPHER_DES_ECB: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_CIPHER_DES_CBC: + ret = RTE_CRYPTO_CIPHER_DES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_ECB: + ret = RTE_CRYPTO_CIPHER_3DES_ECB; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_CBC: + ret = RTE_CRYPTO_CIPHER_3DES_CBC; + break; + case VIRTIO_CRYPTO_CIPHER_3DES_CTR: + ret = RTE_CRYPTO_CIPHER_3DES_CTR; + break; + case VIRTIO_CRYPTO_CIPHER_KASUMI_F8: + ret = RTE_CRYPTO_CIPHER_KASUMI_F8; + break; + case VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2: + ret = RTE_CRYPTO_CIPHER_SNOW3G_UEA2; + break; + case VIRTIO_CRYPTO_CIPHER_AES_F8: + ret = RTE_CRYPTO_CIPHER_AES_F8; + break; + case VIRTIO_CRYPTO_CIPHER_AES_XTS: + ret = RTE_CRYPTO_CIPHER_AES_XTS; + break; + case VIRTIO_CRYPTO_CIPHER_ZUC_EEA3: + ret = RTE_CRYPTO_CIPHER_ZUC_EEA3; + break; + default: + ret = -VIRTIO_CRYPTO_BADMSG; + break; + } + + return ret; +} + +static int +auth_algo_transform(uint32_t virtio_auth_algo) +{ + int ret; + + switch (virtio_auth_algo) { + + case VIRTIO_CRYPTO_NO_MAC: + ret = RTE_CRYPTO_AUTH_NULL; + break; + case VIRTIO_CRYPTO_MAC_HMAC_MD5: + ret = RTE_CRYPTO_AUTH_MD5_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA1: + ret = RTE_CRYPTO_AUTH_SHA1_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_224: + ret = RTE_CRYPTO_AUTH_SHA224_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_256: + ret = RTE_CRYPTO_AUTH_SHA256_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_384: + ret = RTE_CRYPTO_AUTH_SHA384_HMAC; + break; + case VIRTIO_CRYPTO_MAC_HMAC_SHA_512: + ret = RTE_CRYPTO_AUTH_SHA512_HMAC; + break; + case VIRTIO_CRYPTO_MAC_CMAC_3DES: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_CMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_CMAC; + break; + case VIRTIO_CRYPTO_MAC_KASUMI_F9: + ret = RTE_CRYPTO_AUTH_KASUMI_F9; + break; + case VIRTIO_CRYPTO_MAC_SNOW3G_UIA2: + ret = RTE_CRYPTO_AUTH_SNOW3G_UIA2; + break; + case VIRTIO_CRYPTO_MAC_GMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_GMAC; + break; + case VIRTIO_CRYPTO_MAC_GMAC_TWOFISH: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_CBCMAC_AES: + ret = RTE_CRYPTO_AUTH_AES_CBC_MAC; + break; + case VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9: + ret = -VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_MAC_XCBC_AES: + ret = RTE_CRYPTO_AUTH_AES_XCBC_MAC; + break; + default: + ret = -VIRTIO_CRYPTO_BADMSG; + break; + } + + return ret; +} + +static int get_iv_len(enum rte_crypto_cipher_algorithm algo) +{ + int len; + + switch (algo) { + case RTE_CRYPTO_CIPHER_3DES_CBC: + len = 8; + break; + case RTE_CRYPTO_CIPHER_3DES_CTR: + len = 8; + break; + case RTE_CRYPTO_CIPHER_3DES_ECB: + len = 8; + break; + case RTE_CRYPTO_CIPHER_AES_CBC: + len = 16; + break; + + /* TODO: add common algos */ + + default: + len = -1; + break; + } + + return len; +} + +/** + * vhost_crypto struct is used to maintain a number of virtio_cryptos and + * one DPDK crypto device that deals with 
all crypto workloads. It is declared + * here and defined in vhost_crypto.c + */ +struct vhost_crypto { + /** Used to lookup DPDK Cryptodev Session based on VIRTIO crypto + * session ID. + */ + struct rte_hash *session_map; + struct rte_mempool *mbuf_pool; + struct rte_mempool *sess_pool; + + /** DPDK cryptodev ID */ + uint8_t cid; + uint16_t nb_qps; + + uint64_t last_session_id; + + uint64_t cache_session_id; + struct rte_cryptodev_sym_session *cache_session; + /** socket id for the device */ + int socket_id; + + struct virtio_net *dev; + + uint8_t option; +} __rte_cache_aligned; + +struct vhost_crypto_data_req { + struct vring_desc *head; + struct virtio_net *dev; + struct virtio_crypto_inhdr *inhdr; + struct vhost_virtqueue *vq; + struct vring_desc *wb_desc; + uint16_t wb_len; + uint16_t desc_idx; + uint16_t len; + uint16_t zero_copy; +}; + +static int +transform_cipher_param(struct rte_crypto_sym_xform *xform, + VhostUserCryptoSessionParam *param) +{ + int ret; + + ret = cipher_algo_transform(param->cipher_algo); + if (unlikely(ret < 0)) + return ret; + + xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform->cipher.algo = (uint32_t)ret; + xform->cipher.key.length = param->cipher_key_len; + if (xform->cipher.key.length > 0) + xform->cipher.key.data = param->cipher_key_buf; + if (param->dir == VIRTIO_CRYPTO_OP_ENCRYPT) + xform->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + else if (param->dir == VIRTIO_CRYPTO_OP_DECRYPT) + xform->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + else { + VC_LOG_DBG("Bad operation type"); + return -VIRTIO_CRYPTO_BADMSG; + } + + ret = get_iv_len(xform->cipher.algo); + if (unlikely(ret < 0)) + return ret; + xform->cipher.iv.length = (uint16_t)ret; + xform->cipher.iv.offset = IV_OFFSET; + return 0; +} + +static int +transform_chain_param(struct rte_crypto_sym_xform *xforms, + VhostUserCryptoSessionParam *param) +{ + struct rte_crypto_sym_xform *xform_cipher, *xform_auth; + int ret; + + switch (param->chaining_dir) { + case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER: + xform_auth = xforms; + xform_cipher = xforms->next; + xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_VERIFY; + break; + case VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH: + xform_cipher = xforms; + xform_auth = xforms->next; + xform_cipher->cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + xform_auth->auth.op = RTE_CRYPTO_AUTH_OP_GENERATE; + break; + default: + return -VIRTIO_CRYPTO_BADMSG; + } + + /* cipher */ + ret = cipher_algo_transform(param->cipher_algo); + if (unlikely(ret < 0)) + return ret; + xform_cipher->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform_cipher->cipher.algo = (uint32_t)ret; + xform_cipher->cipher.key.length = param->cipher_key_len; + xform_cipher->cipher.key.data = param->cipher_key_buf; + ret = get_iv_len(xform_cipher->cipher.algo); + if (unlikely(ret < 0)) + return ret; + xform_cipher->cipher.iv.length = (uint16_t)ret; + xform_cipher->cipher.iv.offset = IV_OFFSET; + + /* auth */ + xform_auth->type = RTE_CRYPTO_SYM_XFORM_AUTH; + ret = auth_algo_transform(param->hash_algo); + if (unlikely(ret < 0)) + return ret; + xform_auth->auth.algo = (uint32_t)ret; + xform_auth->auth.digest_length = param->digest_len; + xform_auth->auth.key.length = param->auth_key_len; + xform_auth->auth.key.data = param->auth_key_buf; + + return 0; +} + +static void +vhost_crypto_create_sess(struct vhost_crypto *vcrypto, + VhostUserCryptoSessionParam *sess_param) +{ + struct rte_crypto_sym_xform xform1 = {0}, xform2 = {0}; + struct 
rte_cryptodev_sym_session *session; + int ret; + + switch (sess_param->op_type) { + case VIRTIO_CRYPTO_SYM_OP_NONE: + case VIRTIO_CRYPTO_SYM_OP_CIPHER: + ret = transform_cipher_param(&xform1, sess_param); + if (unlikely(ret)) { + VC_LOG_ERR("Error transform session msg (%i)", ret); + sess_param->session_id = ret; + return; + } + break; + case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING: + if (unlikely(sess_param->hash_mode != + VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH)) { + sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP; + VC_LOG_ERR("Error transform session message (%i)", + -VIRTIO_CRYPTO_NOTSUPP); + return; + } + + xform1.next = &xform2; + + ret = transform_chain_param(&xform1, sess_param); + if (unlikely(ret)) { + VC_LOG_ERR("Error transform session message (%i)", ret); + sess_param->session_id = ret; + return; + } + + break; + default: + VC_LOG_ERR("Algorithm not yet supported"); + sess_param->session_id = -VIRTIO_CRYPTO_NOTSUPP; + return; + } + + session = rte_cryptodev_sym_session_create(vcrypto->sess_pool); + if (!session) { + VC_LOG_ERR("Failed to create session"); + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + if (rte_cryptodev_sym_session_init(vcrypto->cid, session, &xform1, + vcrypto->sess_pool) < 0) { + VC_LOG_ERR("Failed to initialize session"); + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + /* insert hash to map */ + if (rte_hash_add_key_data(vcrypto->session_map, + &vcrypto->last_session_id, session) < 0) { + VC_LOG_ERR("Failed to insert session to hash table"); + + if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0) + VC_LOG_ERR("Failed to clear session"); + else { + if (rte_cryptodev_sym_session_free(session) < 0) + VC_LOG_ERR("Failed to free session"); + } + sess_param->session_id = -VIRTIO_CRYPTO_ERR; + return; + } + + VC_LOG_INFO("Session %"PRIu64" created for vdev %i.", + vcrypto->last_session_id, vcrypto->dev->vid); + + sess_param->session_id = vcrypto->last_session_id; + vcrypto->last_session_id++; +} + +static int +vhost_crypto_close_sess(struct vhost_crypto *vcrypto, uint64_t session_id) +{ + struct rte_cryptodev_sym_session *session; + uint64_t sess_id = session_id; + int ret; + + ret = rte_hash_lookup_data(vcrypto->session_map, &sess_id, + (void **)&session); + + if (unlikely(ret < 0)) { + VC_LOG_ERR("Failed to delete session %"PRIu64".", session_id); + return -VIRTIO_CRYPTO_INVSESS; + } + + if (rte_cryptodev_sym_session_clear(vcrypto->cid, session) < 0) { + VC_LOG_DBG("Failed to clear session"); + return -VIRTIO_CRYPTO_ERR; + } + + if (rte_cryptodev_sym_session_free(session) < 0) { + VC_LOG_DBG("Failed to free session"); + return -VIRTIO_CRYPTO_ERR; + } + + if (rte_hash_del_key(vcrypto->session_map, &sess_id) < 0) { + VC_LOG_DBG("Failed to delete session from hash table."); + return -VIRTIO_CRYPTO_ERR; + } + + VC_LOG_INFO("Session %"PRIu64" deleted for vdev %i.", sess_id, + vcrypto->dev->vid); + + return 0; +} + +static int +vhost_crypto_msg_post_handler(int vid, void *msg, uint32_t *require_reply) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + VhostUserMsg *vmsg = msg; + int ret = 0; + + if (dev == NULL || require_reply == NULL) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + vcrypto = dev->extern_data; + if (vcrypto == NULL) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + *require_reply = 0; + + if (vmsg->request.master == VHOST_USER_CRYPTO_CREATE_SESS) { + vhost_crypto_create_sess(vcrypto, + &vmsg->payload.crypto_session); + 
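For illustration only (not part of the patch): the create/close paths above key each cryptodev session by a monotonically increasing 64-bit id in an rte_hash, which is what lets the data path later resolve the session id carried in each request. A trimmed sketch of that map pattern, assuming an initialized EAL; the names below are hypothetical and error handling is omitted:

#include <stdint.h>
#include <rte_hash.h>
#include <rte_jhash.h>

/* Map from 64-bit session id to an opaque session pointer. */
struct rte_hash *
example_session_map_create(int socket_id)
{
	struct rte_hash_parameters params = {
		.name = "example_sess_map",
		.entries = 1024,
		.key_len = sizeof(uint64_t),
		.hash_func = rte_jhash,
		.socket_id = socket_id,
	};

	return rte_hash_create(&params);
}

/* Store a session under its id, then look it up again. */
void *
example_session_roundtrip(struct rte_hash *map, uint64_t id, void *sess)
{
	void *found = NULL;

	rte_hash_add_key_data(map, &id, sess);
	rte_hash_lookup_data(map, &id, &found);
	return found;
}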
*require_reply = 1; + } else if (vmsg->request.master == VHOST_USER_CRYPTO_CLOSE_SESS) + ret = vhost_crypto_close_sess(vcrypto, vmsg->payload.u64); + else + ret = -EINVAL; + + return ret; +} + +static __rte_always_inline struct vring_desc * +find_write_desc(struct vring_desc *head, struct vring_desc *desc) +{ + if (desc->flags & VRING_DESC_F_WRITE) + return desc; + + while (desc->flags & VRING_DESC_F_NEXT) { + desc = &head[desc->next]; + if (desc->flags & VRING_DESC_F_WRITE) + return desc; + } + + return NULL; +} + +static struct virtio_crypto_inhdr * +reach_inhdr(struct vhost_crypto_data_req *vc_req, struct vring_desc *desc) +{ + uint64_t dlen; + struct virtio_crypto_inhdr *inhdr; + + while (desc->flags & VRING_DESC_F_NEXT) + desc = &vc_req->head[desc->next]; + + dlen = desc->len; + inhdr = IOVA_TO_VVA(struct virtio_crypto_inhdr *, vc_req, desc->addr, + &dlen, VHOST_ACCESS_WO); + if (unlikely(!inhdr || dlen != desc->len)) + return NULL; + + return inhdr; +} + +static __rte_always_inline int +move_desc(struct vring_desc *head, struct vring_desc **cur_desc, + uint32_t size) +{ + struct vring_desc *desc = *cur_desc; + int left = size; + + rte_prefetch0(&head[desc->next]); + left -= desc->len; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &head[desc->next]; + rte_prefetch0(&head[desc->next]); + left -= desc->len; + } + + if (unlikely(left > 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + *cur_desc = &head[desc->next]; + return 0; +} + +static int +copy_data(void *dst_data, struct vhost_crypto_data_req *vc_req, + struct vring_desc **cur_desc, uint32_t size) +{ + struct vring_desc *desc = *cur_desc; + uint64_t remain, addr, dlen, len; + uint32_t to_copy; + uint8_t *data = dst_data; + uint8_t *src; + int left = size; + + rte_prefetch0(&vc_req->head[desc->next]); + to_copy = RTE_MIN(desc->len, (uint32_t)left); + dlen = to_copy; + src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RO); + if (unlikely(!src || !dlen)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy((uint8_t *)data, src, dlen); + data += dlen; + + if (unlikely(dlen < to_copy)) { + remain = to_copy - dlen; + addr = desc->addr + dlen; + + while (remain) { + len = remain; + src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, len); + addr += len; + remain -= len; + data += len; + } + } + + left -= to_copy; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &vc_req->head[desc->next]; + rte_prefetch0(&vc_req->head[desc->next]); + to_copy = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + src = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RO); + if (unlikely(!src || !dlen)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, dlen); + data += dlen; + + if (unlikely(dlen < to_copy)) { + remain = to_copy - dlen; + addr = desc->addr + dlen; + + while (remain) { + len = remain; + src = IOVA_TO_VVA(uint8_t *, vc_req, addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(data, src, len); + addr += len; + remain -= len; + data += len; + } + } + + left -= to_copy; + } + + if (unlikely(left > 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + *cur_desc = &vc_req->head[desc->next]; + + return 0; +} + +static __rte_always_inline void * 
+get_data_ptr(struct vhost_crypto_data_req *vc_req, struct vring_desc **cur_desc, + uint32_t size, uint8_t perm) +{ + void *data; + uint64_t dlen = (*cur_desc)->len; + + data = IOVA_TO_VVA(void *, vc_req, (*cur_desc)->addr, &dlen, perm); + if (unlikely(!data || dlen != (*cur_desc)->len)) { + VC_LOG_ERR("Failed to map object"); + return NULL; + } + + if (unlikely(move_desc(vc_req->head, cur_desc, size) < 0)) + return NULL; + + return data; +} + +static int +write_back_data(struct rte_crypto_op *op, struct vhost_crypto_data_req *vc_req) +{ + struct rte_mbuf *mbuf = op->sym->m_dst; + struct vring_desc *head = vc_req->head; + struct vring_desc *desc = vc_req->wb_desc; + int left = vc_req->wb_len; + uint32_t to_write; + uint8_t *src_data = mbuf->buf_addr, *dst; + uint64_t dlen; + + rte_prefetch0(&head[desc->next]); + to_write = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RW); + if (unlikely(!dst || dlen != desc->len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(dst, src_data, to_write); + left -= to_write; + src_data += to_write; + + while ((desc->flags & VRING_DESC_F_NEXT) && left > 0) { + desc = &head[desc->next]; + rte_prefetch0(&head[desc->next]); + to_write = RTE_MIN(desc->len, (uint32_t)left); + dlen = desc->len; + dst = IOVA_TO_VVA(uint8_t *, vc_req, desc->addr, &dlen, + VHOST_ACCESS_RW); + if (unlikely(!dst || dlen != desc->len)) { + VC_LOG_ERR("Failed to map descriptor"); + return -1; + } + + rte_memcpy(dst, src_data, to_write); + left -= to_write; + src_data += to_write; + } + + if (unlikely(left < 0)) { + VC_LOG_ERR("Incorrect virtio descriptor"); + return -1; + } + + return 0; +} + +static uint8_t +prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op, + struct vhost_crypto_data_req *vc_req, + struct virtio_crypto_cipher_data_req *cipher, + struct vring_desc *cur_desc) +{ + struct vring_desc *desc = cur_desc; + struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst; + uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET); + uint8_t ret = 0; + + /* prepare */ + /* iv */ + if (unlikely(copy_data(iv_data, vc_req, &desc, + cipher->para.iv_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + m_src->data_len = cipher->para.src_data_len; + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr, + cipher->para.src_data_len); + m_src->buf_addr = get_data_ptr(vc_req, &desc, + cipher->para.src_data_len, VHOST_ACCESS_RO); + if (unlikely(m_src->buf_iova == 0 || + m_src->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + if (unlikely(cipher->para.src_data_len > + RTE_MBUF_DEFAULT_BUF_SIZE)) { + VC_LOG_ERR("Not enough space to do data copy"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *), + vc_req, &desc, cipher->para.src_data_len) + < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* dst */ + desc = find_write_desc(vc_req->head, desc); + if (unlikely(!desc)) { + VC_LOG_ERR("Cannot find write location"); + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_dst->buf_iova = 
gpa_to_hpa(vcrypto->dev, + desc->addr, cipher->para.dst_data_len); + m_dst->buf_addr = get_data_ptr(vc_req, &desc, + cipher->para.dst_data_len, VHOST_ACCESS_RW); + if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + + m_dst->data_len = cipher->para.dst_data_len; + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + vc_req->wb_desc = desc; + vc_req->wb_len = cipher->para.dst_data_len; + if (unlikely(move_desc(vc_req->head, &desc, + vc_req->wb_len) < 0)) { + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* src data */ + op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + + op->sym->cipher.data.offset = 0; + op->sym->cipher.data.length = cipher->para.src_data_len; + + vc_req->inhdr = get_data_ptr(vc_req, &desc, INHDR_LEN, VHOST_ACCESS_WO); + if (unlikely(vc_req->inhdr == NULL)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + vc_req->inhdr->status = VIRTIO_CRYPTO_OK; + vc_req->len = cipher->para.dst_data_len + INHDR_LEN; + + return 0; + +error_exit: + vc_req->len = INHDR_LEN; + return ret; +} + +static uint8_t +prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op, + struct vhost_crypto_data_req *vc_req, + struct virtio_crypto_alg_chain_data_req *chain, + struct vring_desc *cur_desc) +{ + struct vring_desc *desc = cur_desc; + struct rte_mbuf *m_src = op->sym->m_src, *m_dst = op->sym->m_dst; + uint8_t *iv_data = rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET); + uint32_t digest_offset; + void *digest_addr; + uint8_t ret = 0; + + /* prepare */ + /* iv */ + if (unlikely(copy_data(iv_data, vc_req, &desc, + chain->para.iv_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + m_src->data_len = chain->para.src_data_len; + m_dst->data_len = chain->para.dst_data_len; + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr, + chain->para.src_data_len); + m_src->buf_addr = get_data_ptr(vc_req, &desc, + chain->para.src_data_len, VHOST_ACCESS_RO); + if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + if (unlikely(chain->para.src_data_len > + RTE_MBUF_DEFAULT_BUF_SIZE)) { + VC_LOG_ERR("Not enough space to do data copy"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + if (unlikely(copy_data(rte_pktmbuf_mtod(m_src, uint8_t *), + vc_req, &desc, chain->para.src_data_len)) < 0) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* dst */ + desc = find_write_desc(vc_req->head, desc); + if (unlikely(!desc)) { + VC_LOG_ERR("Cannot find write location"); + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + m_dst->buf_iova = gpa_to_hpa(vcrypto->dev, + desc->addr, chain->para.dst_data_len); + m_dst->buf_addr = get_data_ptr(vc_req, &desc, + chain->para.dst_data_len, VHOST_ACCESS_RW); + if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + + op->sym->auth.digest.phys_addr = gpa_to_hpa(vcrypto->dev, + 
desc->addr, chain->para.hash_result_len); + op->sym->auth.digest.data = get_data_ptr(vc_req, &desc, + chain->para.hash_result_len, VHOST_ACCESS_RW); + if (unlikely(op->sym->auth.digest.phys_addr == 0)) { + VC_LOG_ERR("zero_copy may fail due to cross page data"); + ret = VIRTIO_CRYPTO_ERR; + goto error_exit; + } + break; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + digest_offset = m_dst->data_len; + digest_addr = rte_pktmbuf_mtod_offset(m_dst, void *, + digest_offset); + + vc_req->wb_desc = desc; + vc_req->wb_len = m_dst->data_len + chain->para.hash_result_len; + + if (unlikely(move_desc(vc_req->head, &desc, + chain->para.dst_data_len) < 0)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + if (unlikely(copy_data(digest_addr, vc_req, &desc, + chain->para.hash_result_len)) < 0) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + op->sym->auth.digest.data = digest_addr; + op->sym->auth.digest.phys_addr = rte_pktmbuf_iova_offset(m_dst, + digest_offset); + break; + default: + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + /* record inhdr */ + vc_req->inhdr = get_data_ptr(vc_req, &desc, INHDR_LEN, VHOST_ACCESS_WO); + if (unlikely(vc_req->inhdr == NULL)) { + ret = VIRTIO_CRYPTO_BADMSG; + goto error_exit; + } + + vc_req->inhdr->status = VIRTIO_CRYPTO_OK; + + op->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + op->sess_type = RTE_CRYPTO_OP_WITH_SESSION; + + op->sym->cipher.data.offset = chain->para.cipher_start_src_offset; + op->sym->cipher.data.length = chain->para.src_data_len - + chain->para.cipher_start_src_offset; + + op->sym->auth.data.offset = chain->para.hash_start_src_offset; + op->sym->auth.data.length = chain->para.len_to_hash; + + vc_req->len = chain->para.dst_data_len + chain->para.hash_result_len + + INHDR_LEN; + return 0; + +error_exit: + vc_req->len = INHDR_LEN; + return ret; +} + +/** + * Process on descriptor + */ +static __rte_always_inline int +vhost_crypto_process_one_req(struct vhost_crypto *vcrypto, + struct vhost_virtqueue *vq, struct rte_crypto_op *op, + struct vring_desc *head, uint16_t desc_idx) +{ + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(op->sym->m_src); + struct rte_cryptodev_sym_session *session; + struct virtio_crypto_op_data_req *req, tmp_req; + struct virtio_crypto_inhdr *inhdr; + struct vring_desc *desc = NULL; + uint64_t session_id; + uint64_t dlen; + int err = 0; + + vc_req->desc_idx = desc_idx; + vc_req->dev = vcrypto->dev; + vc_req->vq = vq; + + if (likely(head->flags & VRING_DESC_F_INDIRECT)) { + dlen = head->len; + desc = IOVA_TO_VVA(struct vring_desc *, vc_req, head->addr, + &dlen, VHOST_ACCESS_RO); + if (unlikely(!desc || dlen != head->len)) + return -1; + desc_idx = 0; + head = desc; + } else { + desc = head; + } + + vc_req->head = head; + vc_req->zero_copy = vcrypto->option; + + req = get_data_ptr(vc_req, &desc, sizeof(*req), VHOST_ACCESS_RO); + if (unlikely(req == NULL)) { + switch (vcrypto->option) { + case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE: + err = VIRTIO_CRYPTO_BADMSG; + VC_LOG_ERR("Invalid descriptor"); + goto error_exit; + case RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE: + req = &tmp_req; + if (unlikely(copy_data(req, vc_req, &desc, sizeof(*req)) + < 0)) { + err = VIRTIO_CRYPTO_BADMSG; + VC_LOG_ERR("Invalid descriptor"); + goto error_exit; + } + break; + default: + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Invalid option"); + goto error_exit; + } + } + + switch (req->header.opcode) { + case VIRTIO_CRYPTO_CIPHER_ENCRYPT: + case VIRTIO_CRYPTO_CIPHER_DECRYPT: + session_id = req->header.session_id; + + /* one branch to avoid 
unnecessary table lookup */ + if (vcrypto->cache_session_id != session_id) { + err = rte_hash_lookup_data(vcrypto->session_map, + &session_id, (void **)&session); + if (unlikely(err < 0)) { + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Failed to find session %"PRIu64, + session_id); + goto error_exit; + } + + vcrypto->cache_session = session; + vcrypto->cache_session_id = session_id; + } + + session = vcrypto->cache_session; + + err = rte_crypto_op_attach_sym_session(op, session); + if (unlikely(err < 0)) { + err = VIRTIO_CRYPTO_ERR; + VC_LOG_ERR("Failed to attach session to op"); + goto error_exit; + } + + switch (req->u.sym_req.op_type) { + case VIRTIO_CRYPTO_SYM_OP_NONE: + err = VIRTIO_CRYPTO_NOTSUPP; + break; + case VIRTIO_CRYPTO_SYM_OP_CIPHER: + err = prepare_sym_cipher_op(vcrypto, op, vc_req, + &req->u.sym_req.u.cipher, desc); + break; + case VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING: + err = prepare_sym_chain_op(vcrypto, op, vc_req, + &req->u.sym_req.u.chain, desc); + break; + } + if (unlikely(err != 0)) { + VC_LOG_ERR("Failed to process sym request"); + goto error_exit; + } + break; + default: + VC_LOG_ERR("Unsupported symmetric crypto request type %u", + req->header.opcode); + goto error_exit; + } + + return 0; + +error_exit: + + inhdr = reach_inhdr(vc_req, desc); + if (likely(inhdr != NULL)) + inhdr->status = (uint8_t)err; + + return -1; +} + +static __rte_always_inline struct vhost_virtqueue * +vhost_crypto_finalize_one_request(struct rte_crypto_op *op, + struct vhost_virtqueue *old_vq) +{ + struct rte_mbuf *m_src = op->sym->m_src; + struct rte_mbuf *m_dst = op->sym->m_dst; + struct vhost_crypto_data_req *vc_req = rte_mbuf_to_priv(m_src); + uint16_t desc_idx; + int ret = 0; + + if (unlikely(!vc_req)) { + VC_LOG_ERR("Failed to retrieve vc_req"); + return NULL; + } + + if (old_vq && (vc_req->vq != old_vq)) + return vc_req->vq; + + desc_idx = vc_req->desc_idx; + + if (unlikely(op->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) + vc_req->inhdr->status = VIRTIO_CRYPTO_ERR; + else { + if (vc_req->zero_copy == 0) { + ret = write_back_data(op, vc_req); + if (unlikely(ret != 0)) + vc_req->inhdr->status = VIRTIO_CRYPTO_ERR; + } + } + + vc_req->vq->used->ring[desc_idx].id = desc_idx; + vc_req->vq->used->ring[desc_idx].len = vc_req->len; + + rte_mempool_put(m_dst->pool, (void *)m_dst); + rte_mempool_put(m_src->pool, (void *)m_src); + + return vc_req->vq; +} + +static __rte_always_inline uint16_t +vhost_crypto_complete_one_vm_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfd) +{ + uint16_t processed = 1; + struct vhost_virtqueue *vq, *tmp_vq; + + if (unlikely(nb_ops == 0)) + return 0; + + vq = vhost_crypto_finalize_one_request(ops[0], NULL); + if (unlikely(vq == NULL)) + return 0; + tmp_vq = vq; + + while ((processed < nb_ops)) { + tmp_vq = vhost_crypto_finalize_one_request(ops[processed], + tmp_vq); + + if (unlikely(vq != tmp_vq)) + break; + + processed++; + } + + *callfd = vq->callfd; + + *(volatile uint16_t *)&vq->used->idx += processed; + + return processed; +} + +int __rte_experimental +rte_vhost_crypto_create(int vid, uint8_t cryptodev_id, + struct rte_mempool *sess_pool, int socket_id) +{ + struct virtio_net *dev = get_device(vid); + struct rte_hash_parameters params = {0}; + struct vhost_crypto *vcrypto; + char name[128]; + int ret; + + if (!dev) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + ret = rte_vhost_driver_set_features(dev->ifname, + VIRTIO_CRYPTO_FEATURES); + if (ret < 0) { + VC_LOG_ERR("Error setting features"); + return -1; + } + + vcrypto = 
rte_zmalloc_socket(NULL, sizeof(*vcrypto), + RTE_CACHE_LINE_SIZE, socket_id); + if (!vcrypto) { + VC_LOG_ERR("Insufficient memory"); + return -ENOMEM; + } + + vcrypto->sess_pool = sess_pool; + vcrypto->cid = cryptodev_id; + vcrypto->cache_session_id = UINT64_MAX; + vcrypto->last_session_id = 1; + vcrypto->dev = dev; + vcrypto->option = RTE_VHOST_CRYPTO_ZERO_COPY_DISABLE; + + snprintf(name, 127, "HASH_VHOST_CRYPT_%u", (uint32_t)vid); + params.name = name; + params.entries = VHOST_CRYPTO_SESSION_MAP_ENTRIES; + params.hash_func = rte_jhash; + params.key_len = sizeof(uint64_t); + params.socket_id = socket_id; + vcrypto->session_map = rte_hash_create(¶ms); + if (!vcrypto->session_map) { + VC_LOG_ERR("Failed to creath session map"); + ret = -ENOMEM; + goto error_exit; + } + + snprintf(name, 127, "MBUF_POOL_VM_%u", (uint32_t)vid); + vcrypto->mbuf_pool = rte_pktmbuf_pool_create(name, + VHOST_CRYPTO_MBUF_POOL_SIZE, 512, + sizeof(struct vhost_crypto_data_req), + RTE_MBUF_DEFAULT_DATAROOM * 2 + RTE_PKTMBUF_HEADROOM, + rte_socket_id()); + if (!vcrypto->mbuf_pool) { + VC_LOG_ERR("Failed to creath mbuf pool"); + ret = -ENOMEM; + goto error_exit; + } + + dev->extern_data = vcrypto; + dev->extern_ops.pre_msg_handle = NULL; + dev->extern_ops.post_msg_handle = vhost_crypto_msg_post_handler; + + return 0; + +error_exit: + if (vcrypto->session_map) + rte_hash_free(vcrypto->session_map); + if (vcrypto->mbuf_pool) + rte_mempool_free(vcrypto->mbuf_pool); + + rte_free(vcrypto); + + return ret; +} + +int __rte_experimental +rte_vhost_crypto_free(int vid) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + vcrypto = dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + rte_hash_free(vcrypto->session_map); + rte_mempool_free(vcrypto->mbuf_pool); + rte_free(vcrypto); + + dev->extern_data = NULL; + dev->extern_ops.pre_msg_handle = NULL; + dev->extern_ops.post_msg_handle = NULL; + + return 0; +} + +int __rte_experimental +rte_vhost_crypto_set_zero_copy(int vid, enum rte_vhost_crypto_zero_copy option) +{ + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + if (unlikely((uint32_t)option >= + RTE_VHOST_CRYPTO_MAX_ZERO_COPY_OPTIONS)) { + VC_LOG_ERR("Invalid option %i", option); + return -EINVAL; + } + + vcrypto = (struct vhost_crypto *)dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + if (vcrypto->option == (uint8_t)option) + return 0; + + if (!(rte_mempool_full(vcrypto->mbuf_pool))) { + VC_LOG_ERR("Cannot update zero copy as mempool is not full"); + return -EINVAL; + } + + vcrypto->option = (uint8_t)option; + + return 0; +} + +uint16_t __rte_experimental +rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + struct rte_crypto_op **ops, uint16_t nb_ops) +{ + struct rte_mbuf *mbufs[VHOST_CRYPTO_MAX_BURST_SIZE * 2]; + struct virtio_net *dev = get_device(vid); + struct vhost_crypto *vcrypto; + struct vhost_virtqueue *vq; + uint16_t avail_idx; + uint16_t start_idx; + uint16_t required; + uint16_t count; + uint16_t i; + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); + return -EINVAL; + } + + if (unlikely(qid >= VHOST_MAX_QUEUE_PAIRS)) { + VC_LOG_ERR("Invalid qid %u", qid); + return -EINVAL; 
+ } + + vcrypto = (struct vhost_crypto *)dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); + return -ENOENT; + } + + vq = dev->virtqueue[qid]; + + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + count = avail_idx - start_idx; + count = RTE_MIN(count, VHOST_CRYPTO_MAX_BURST_SIZE); + count = RTE_MIN(count, nb_ops); + + if (unlikely(count == 0)) + return 0; + + /* for zero copy, we need 2 empty mbufs for src and dst, otherwise + * we need only 1 mbuf as src and dst + */ + required = count * 2; + if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool, (void **)mbufs, + required) < 0)) { + VC_LOG_ERR("Insufficient memory"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + uint16_t used_idx = (start_idx + i) & (vq->size - 1); + uint16_t desc_idx = vq->avail->ring[used_idx]; + struct vring_desc *head = &vq->desc[desc_idx]; + struct rte_crypto_op *op = ops[i]; + + op->sym->m_src = mbufs[i * 2]; + op->sym->m_dst = mbufs[i * 2 + 1]; + op->sym->m_src->data_off = 0; + op->sym->m_dst->data_off = 0; + + if (unlikely(vhost_crypto_process_one_req(vcrypto, vq, op, head, + desc_idx)) < 0) + break; + } + + vq->last_used_idx += i; + + return i; +} + +uint16_t __rte_experimental +rte_vhost_crypto_finalize_requests(struct rte_crypto_op **ops, + uint16_t nb_ops, int *callfds, uint16_t *nb_callfds) +{ + struct rte_crypto_op **tmp_ops = ops; + uint16_t count = 0, left = nb_ops; + int callfd; + uint16_t idx = 0; + + while (left) { + count = vhost_crypto_complete_one_vm_requests(tmp_ops, left, + &callfd); + if (unlikely(count == 0)) + break; + + tmp_ops = &tmp_ops[count]; + left -= count; + + callfds[idx++] = callfd; + + if (unlikely(idx >= VIRTIO_CRYPTO_MAX_NUM_BURST_VQS)) { + VC_LOG_ERR("Too many vqs"); + break; + } + } + + *nb_callfds = idx; + + return nb_ops - left; +} diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c index 90ed2112..a2d4c9ff 100644 --- a/lib/librte_vhost/vhost_user.c +++ b/lib/librte_vhost/vhost_user.c @@ -1,5 +1,22 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2016 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation + */ + +/* Security model + * -------------- + * The vhost-user protocol connection is an external interface, so it must be + * robust against invalid inputs. + * + * This is important because the vhost-user master is only one step removed + * from the guest. Malicious guests that have escaped will then launch further + * attacks from the vhost-user master. + * + * Even in deployments where guests are trusted, a bug in the vhost-user master + * can still cause invalid messages to be sent. Such messages must not + * compromise the stability of the DPDK application by causing crashes, memory + * corruption, or other problematic behavior. + * + * Do not assume received VhostUserMsg fields contain sensible values! 
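For illustration only (not part of the patch): in practice this principle means every count, index and length taken from a message is range-checked before use, as later hunks in this file do for the memory region count, the virtqueue size and the log mmap offset. A minimal sketch of that pattern, with hypothetical limits:

#include <stdint.h>
#include <stdbool.h>

#define EXAMPLE_MAX_NREGIONS 8      /* hypothetical cap, cf. VHOST_MEMORY_MAX_NREGIONS */
#define EXAMPLE_MAX_VQ_SIZE  32768  /* maximum queue size allowed by VIRTIO 1.0 */

/* Reject values an untrusted master could use to overrun arrays. */
bool
example_msg_fields_ok(uint32_t nregions, uint16_t vq_size)
{
	if (nregions > EXAMPLE_MAX_NREGIONS)
		return false;
	/* Queue size must be a power of two and within the spec limit. */
	if (vq_size == 0 || (vq_size & (vq_size - 1)) || vq_size > EXAMPLE_MAX_VQ_SIZE)
		return false;
	return true;
}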
*/ #include <stdint.h> @@ -50,6 +67,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", [VHOST_USER_SET_SLAVE_REQ_FD] = "VHOST_USER_SET_SLAVE_REQ_FD", [VHOST_USER_IOTLB_MSG] = "VHOST_USER_IOTLB_MSG", + [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS", + [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS", }; static uint64_t @@ -116,10 +135,7 @@ vhost_user_set_owner(void) static int vhost_user_reset_owner(struct virtio_net *dev) { - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } + vhost_destroy_device_notify(dev); cleanup_device(dev, 0); reset_device(dev); @@ -139,12 +155,26 @@ vhost_user_get_features(struct virtio_net *dev) } /* + * The queue number that we support are requested. + */ +static uint32_t +vhost_user_get_queue_num(struct virtio_net *dev) +{ + uint32_t queue_num = 0; + + rte_vhost_driver_get_queue_num(dev->ifname, &queue_num); + return queue_num; +} + +/* * We receive the negotiated features supported by us and the virtio device. */ static int vhost_user_set_features(struct virtio_net *dev, uint64_t features) { uint64_t vhost_features = 0; + struct rte_vdpa_device *vdpa_dev; + int did = -1; rte_vhost_driver_get_features(dev->ifname, &vhost_features); if (features & ~vhost_features) { @@ -181,7 +211,7 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) } else { dev->vhost_hlen = sizeof(struct virtio_net_hdr); } - LOG_DEBUG(VHOST_CONFIG, + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mergeable RX buffers %s, virtio 1 %s\n", dev->vid, (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", @@ -203,10 +233,15 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t features) dev->virtqueue[dev->nr_vring] = NULL; cleanup_vq(vq, 1); - free_vq(vq); + free_vq(dev, vq); } } + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->set_features) + vdpa_dev->ops->set_features(dev->vid); + return 0; } @@ -221,6 +256,17 @@ vhost_user_set_vring_num(struct virtio_net *dev, vq->size = msg->payload.state.num; + /* VIRTIO 1.0, 2.4 Virtqueues says: + * + * Queue Size value is always a power of 2. The maximum Queue Size + * value is 32768. 
+ */ + if ((vq->size & (vq->size - 1)) || vq->size > 32768) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid virtqueue size %u\n", vq->size); + return -1; + } + if (dev->dequeue_zero_copy) { vq->nr_zmbuf = 0; vq->last_zmbuf_idx = 0; @@ -236,13 +282,26 @@ vhost_user_set_vring_num(struct virtio_net *dev, TAILQ_INIT(&vq->zmbuf_list); } - vq->shadow_used_ring = rte_malloc(NULL, + if (vq_is_packed(dev)) { + vq->shadow_used_packed = rte_malloc(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_packed) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + } else { + vq->shadow_used_split = rte_malloc(NULL, vq->size * sizeof(struct vring_used_elem), RTE_CACHE_LINE_SIZE); - if (!vq->shadow_used_ring) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to allocate memory for shadow used ring.\n"); - return -1; + if (!vq->shadow_used_split) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } } vq->batch_copy_elems = rte_malloc(NULL, @@ -269,7 +328,8 @@ numa_realloc(struct virtio_net *dev, int index) struct virtio_net *old_dev; struct vhost_virtqueue *old_vq, *vq; struct zcopy_mbuf *new_zmbuf; - struct vring_used_elem *new_shadow_used_ring; + struct vring_used_elem *new_shadow_used_split; + struct vring_used_elem_packed *new_shadow_used_packed; struct batch_copy_elem *new_batch_copy_elems; int ret; @@ -304,13 +364,26 @@ numa_realloc(struct virtio_net *dev, int index) vq->zmbufs = new_zmbuf; } - new_shadow_used_ring = rte_malloc_socket(NULL, - vq->size * sizeof(struct vring_used_elem), - RTE_CACHE_LINE_SIZE, - newnode); - if (new_shadow_used_ring) { - rte_free(vq->shadow_used_ring); - vq->shadow_used_ring = new_shadow_used_ring; + if (vq_is_packed(dev)) { + new_shadow_used_packed = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem_packed), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_packed) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = new_shadow_used_packed; + } + } else { + new_shadow_used_split = rte_malloc_socket(NULL, + vq->size * + sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE, + newnode); + if (new_shadow_used_split) { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = new_shadow_used_split; + } } new_batch_copy_elems = rte_malloc_socket(NULL, @@ -366,21 +439,26 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused) /* Converts QEMU virtual address to Vhost virtual address. */ static uint64_t -qva_to_vva(struct virtio_net *dev, uint64_t qva) +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) { - struct rte_vhost_mem_region *reg; + struct rte_vhost_mem_region *r; uint32_t i; /* Find the region where the address lives. 
*/ for (i = 0; i < dev->mem->nregions; i++) { - reg = &dev->mem->regions[i]; + r = &dev->mem->regions[i]; + + if (qva >= r->guest_user_addr && + qva < r->guest_user_addr + r->size) { - if (qva >= reg->guest_user_addr && - qva < reg->guest_user_addr + reg->size) { - return qva - reg->guest_user_addr + - reg->host_user_addr; + if (unlikely(*len > r->guest_user_addr + r->size - qva)) + *len = r->guest_user_addr + r->size - qva; + + return qva - r->guest_user_addr + + r->host_user_addr; } } + *len = 0; return 0; } @@ -393,20 +471,20 @@ qva_to_vva(struct virtio_net *dev, uint64_t qva) */ static uint64_t ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t ra, uint64_t size) + uint64_t ra, uint64_t *size) { if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { uint64_t vva; vva = vhost_user_iotlb_cache_find(vq, ra, - &size, VHOST_ACCESS_RW); + size, VHOST_ACCESS_RW); if (!vva) vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW); return vva; } - return qva_to_vva(dev, ra); + return qva_to_vva(dev, ra, size); } static struct virtio_net * @@ -414,16 +492,63 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) { struct vhost_virtqueue *vq = dev->virtqueue[vq_index]; struct vhost_vring_addr *addr = &vq->ring_addrs; + uint64_t len; + + if (vq_is_packed(dev)) { + len = sizeof(struct vring_packed_desc) * vq->size; + vq->desc_packed = (struct vring_packed_desc *)(uintptr_t) + ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len); + vq->log_guest_addr = 0; + if (vq->desc_packed == NULL || + len != sizeof(struct vring_packed_desc) * + vq->size) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map desc_packed ring.\n", + dev->vid); + return dev; + } + + dev = numa_realloc(dev, vq_index); + vq = dev->virtqueue[vq_index]; + addr = &vq->ring_addrs; + + len = sizeof(struct vring_packed_desc_event); + vq->driver_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->avail_user_addr, &len); + if (vq->driver_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find driver area address.\n", + dev->vid); + return dev; + } + + len = sizeof(struct vring_packed_desc_event); + vq->device_event = (struct vring_packed_desc_event *) + (uintptr_t)ring_addr_to_vva(dev, + vq, addr->used_user_addr, &len); + if (vq->device_event == NULL || + len != sizeof(struct vring_packed_desc_event)) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to find device area address.\n", + dev->vid); + return dev; + } + + return dev; + } /* The addresses are converted from QEMU virtual to Vhost virtual. 
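For illustration only (not part of the patch): the translation helpers reworked above now take the requested length by pointer and shrink it to what is contiguous inside the matched region, so callers can detect an object straddling a region boundary by comparing the returned length with what they asked for. A small standalone sketch of that clamping, with hypothetical region values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical memory region: 1 MiB mapped at these addresses. */
	uint64_t guest_user_addr = 0x100000;
	uint64_t host_user_addr  = 0x7f0000000000;
	uint64_t region_size     = 0x100000;

	uint64_t qva = 0x1ff000;   /* 4 KiB before the end of the region */
	uint64_t len = 0x2000;     /* caller asks for 8 KiB */

	/* Same clamping as the reworked qva_to_vva(). */
	if (len > guest_user_addr + region_size - qva)
		len = guest_user_addr + region_size - qva;

	uint64_t vva = qva - guest_user_addr + host_user_addr;

	printf("vva=%#" PRIx64 ", usable len=%#" PRIx64 "\n", vva, len);
	return 0;
}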
*/ if (vq->desc && vq->avail && vq->used) return dev; + len = sizeof(struct vring_desc) * vq->size; vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev, - vq, addr->desc_user_addr, sizeof(struct vring_desc)); - if (vq->desc == 0) { + vq, addr->desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { RTE_LOG(DEBUG, VHOST_CONFIG, - "(%d) failed to find desc ring address.\n", + "(%d) failed to map desc ring.\n", dev->vid); return dev; } @@ -432,20 +557,26 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) vq = dev->virtqueue[vq_index]; addr = &vq->ring_addrs; + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev, - vq, addr->avail_user_addr, sizeof(struct vring_avail)); - if (vq->avail == 0) { + vq, addr->avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { RTE_LOG(DEBUG, VHOST_CONFIG, - "(%d) failed to find avail ring address.\n", + "(%d) failed to map avail ring.\n", dev->vid); return dev; } + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev, - vq, addr->used_user_addr, sizeof(struct vring_used)); - if (vq->used == 0) { + vq, addr->used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { RTE_LOG(DEBUG, VHOST_CONFIG, - "(%d) failed to find used ring address.\n", + "(%d) failed to map used ring.\n", dev->vid); return dev; } @@ -461,13 +592,13 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) vq->log_guest_addr = addr->log_guest_addr; - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", dev->vid, vq->desc); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", dev->vid, vq->avail); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", dev->vid, vq->used); - LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", dev->vid, vq->log_guest_addr); return dev; @@ -500,7 +631,7 @@ vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg) if (vq->enabled && (dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) { - dev = translate_ring_addresses(dev, msg->payload.state.index); + dev = translate_ring_addresses(dev, msg->payload.addr.index); if (!dev) return -1; @@ -525,7 +656,7 @@ vhost_user_set_vring_base(struct virtio_net *dev, return 0; } -static void +static int add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, uint64_t host_phys_addr, uint64_t size) { @@ -535,6 +666,10 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, dev->max_guest_pages *= 2; dev->guest_pages = realloc(dev->guest_pages, dev->max_guest_pages * sizeof(*page)); + if (!dev->guest_pages) { + RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n"); + return -1; + } } if (dev->nr_guest_pages > 0) { @@ -543,7 +678,7 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, if (host_phys_addr == last_page->host_phys_addr + last_page->size) { last_page->size += size; - return; + return 0; } } @@ -551,9 +686,11 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, 
page->guest_phys_addr = guest_phys_addr; page->host_phys_addr = host_phys_addr; page->size = size; + + return 0; } -static void +static int add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, uint64_t page_size) { @@ -567,7 +704,9 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, size = page_size - (guest_phys_addr & (page_size - 1)); size = RTE_MIN(size, reg_size); - add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0) + return -1; + host_user_addr += size; guest_phys_addr += size; reg_size -= size; @@ -576,12 +715,16 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, size = RTE_MIN(reg_size, page_size); host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t) host_user_addr); - add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, + size) < 0) + return -1; host_user_addr += size; guest_phys_addr += size; reg_size -= size; } + + return 0; } #ifdef RTE_LIBRTE_VHOST_DEBUG @@ -635,8 +778,9 @@ vhost_memory_changed(struct VhostUserMemory *new, } static int -vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *pmsg) { + struct virtio_net *dev = *pdev; struct VhostUserMemory memory = pmsg->payload.memory; struct rte_vhost_mem_region *reg; void *mmap_addr; @@ -644,8 +788,15 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) uint64_t mmap_offset; uint64_t alignment; uint32_t i; + int populate; int fd; + if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) { + RTE_LOG(ERR, VHOST_CONFIG, + "too many memory regions (%u)\n", memory.nregions); + return -1; + } + if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) { RTE_LOG(INFO, VHOST_CONFIG, "(%d) memory regions not changed\n", dev->vid); @@ -662,6 +813,11 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) dev->mem = NULL; } + /* Flush IOTLB cache as previous HVAs are now invalid */ + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + for (i = 0; i < dev->nr_vring; i++) + vhost_user_iotlb_flush_all(dev->virtqueue[i]); + dev->nr_guest_pages = 0; if (!dev->guest_pages) { dev->max_guest_pages = 8; @@ -696,7 +852,17 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) reg->fd = fd; mmap_offset = memory.regions[i].mmap_offset; - mmap_size = reg->size + mmap_offset; + + /* Check for memory_size + mmap_offset overflow */ + if (mmap_offset >= -reg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap_offset (%#"PRIx64") and memory_size " + "(%#"PRIx64") overflow\n", + mmap_offset, reg->size); + goto err_mmap; + } + + mmap_size = reg->size + mmap_offset; /* mmap() without flag of MAP_ANONYMOUS, should be called * with length argument aligned with hugepagesz at older @@ -714,8 +880,9 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) } mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + populate = (dev->dequeue_zero_copy) ? 
MAP_POPULATE : 0; mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, 0); + MAP_SHARED | populate, fd, 0); if (mmap_addr == MAP_FAILED) { RTE_LOG(ERR, VHOST_CONFIG, @@ -729,7 +896,12 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) mmap_offset; if (dev->dequeue_zero_copy) - add_guest_pages(dev, reg, alignment); + if (add_guest_pages(dev, reg, alignment) < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "adding guest pages to region %u failed.\n", + i); + goto err_mmap; + } RTE_LOG(INFO, VHOST_CONFIG, "guest memory region %u, size: 0x%" PRIx64 "\n" @@ -750,6 +922,25 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) mmap_offset); } + for (i = 0; i < dev->nr_vring; i++) { + struct vhost_virtqueue *vq = dev->virtqueue[i]; + + if (vq->desc || vq->avail || vq->used) { + /* + * If the memory table got updated, the ring addresses + * need to be translated again as virtual addresses have + * changed. + */ + vring_invalidate(dev, vq); + + dev = translate_ring_addresses(dev, i); + if (!dev) + return -1; + + *pdev = dev; + } + } + dump_guest_pages(dev); return 0; @@ -761,10 +952,20 @@ err_mmap: return -1; } -static int -vq_is_ready(struct vhost_virtqueue *vq) +static bool +vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) { - return vq && vq->desc && vq->avail && vq->used && + bool rings_ok; + + if (!vq) + return false; + + if (vq_is_packed(dev)) + rings_ok = !!vq->desc_packed; + else + rings_ok = vq->desc && vq->avail && vq->used; + + return rings_ok && vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; } @@ -781,7 +982,7 @@ virtio_is_ready(struct virtio_net *dev) for (i = 0; i < dev->nr_vring; i++) { vq = dev->virtqueue[i]; - if (!vq_is_ready(vq)) + if (!vq_is_ready(dev, vq)) return 0; } @@ -874,15 +1075,13 @@ vhost_user_get_vring_base(struct virtio_net *dev, struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; /* We have to stop the queue (virtio) if it is running. 
*/ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } + vhost_destroy_device_notify(dev); dev->flags &= ~VIRTIO_DEV_READY; + dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED; - /* Here we are safe to get the last used index */ - msg->payload.state.num = vq->last_used_idx; + /* Here we are safe to get the last avail index */ + msg->payload.state.num = vq->last_avail_idx; RTE_LOG(INFO, VHOST_CONFIG, "vring base idx:%d file:%d\n", msg->payload.state.index, @@ -897,10 +1096,20 @@ vhost_user_get_vring_base(struct virtio_net *dev, vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + if (dev->dequeue_zero_copy) free_zmbufs(vq); - rte_free(vq->shadow_used_ring); - vq->shadow_used_ring = NULL; + if (vq_is_packed(dev)) { + rte_free(vq->shadow_used_packed); + vq->shadow_used_packed = NULL; + } else { + rte_free(vq->shadow_used_split); + vq->shadow_used_split = NULL; + } rte_free(vq->batch_copy_elems); vq->batch_copy_elems = NULL; @@ -917,16 +1126,24 @@ vhost_user_set_vring_enable(struct virtio_net *dev, VhostUserMsg *msg) { int enable = (int)msg->payload.state.num; + int index = (int)msg->payload.state.index; + struct rte_vdpa_device *vdpa_dev; + int did = -1; RTE_LOG(INFO, VHOST_CONFIG, "set queue enable: %d to qp idx: %d\n", - enable, msg->payload.state.index); + enable, index); + + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->set_vring_state) + vdpa_dev->ops->set_vring_state(dev->vid, index, enable); if (dev->notify_ops->vring_state_changed) dev->notify_ops->vring_state_changed(dev->vid, - msg->payload.state.index, enable); + index, enable); - dev->virtqueue[msg->payload.state.index]->enabled = enable; + dev->virtqueue[index]->enabled = enable; return 0; } @@ -935,9 +1152,10 @@ static void vhost_user_get_protocol_features(struct virtio_net *dev, struct VhostUserMsg *msg) { - uint64_t features, protocol_features = VHOST_USER_PROTOCOL_FEATURES; + uint64_t features, protocol_features; rte_vhost_driver_get_features(dev->ifname, &features); + rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features); /* * REPLY_ACK protocol feature is only mandatory for now @@ -983,6 +1201,15 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) size = msg->payload.log.mmap_size; off = msg->payload.log.mmap_offset; + + /* Don't allow mmap_offset to point outside the mmap region */ + if (off > size) { + RTE_LOG(ERR, VHOST_CONFIG, + "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", + off, size); + return -1; + } + RTE_LOG(INFO, VHOST_CONFIG, "log mmap size: %"PRId64", offset: %"PRId64"\n", size, off); @@ -991,7 +1218,7 @@ vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) * mmap from 0 to workaround a hugepage mmap bug: mmap will * fail when offset is not page size aligned. 
*/ - addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); close(fd); if (addr == MAP_FAILED) { RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); @@ -1024,6 +1251,8 @@ static int vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) { uint8_t *mac = (uint8_t *)&msg->payload.u64; + struct rte_vdpa_device *vdpa_dev; + int did = -1; RTE_LOG(DEBUG, VHOST_CONFIG, ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", @@ -1039,6 +1268,10 @@ vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) */ rte_smp_wmb(); rte_atomic16_set(&dev->broadcast_rarp, 1); + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && vdpa_dev->ops->migration_done) + vdpa_dev->ops->migration_done(dev->vid); return 0; } @@ -1131,11 +1364,12 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) struct virtio_net *dev = *pdev; struct vhost_iotlb_msg *imsg = &msg->payload.iotlb; uint16_t i; - uint64_t vva; + uint64_t vva, len; switch (imsg->type) { case VHOST_IOTLB_UPDATE: - vva = qva_to_vva(dev, imsg->uaddr); + len = imsg->size; + vva = qva_to_vva(dev, imsg->uaddr, &len); if (!vva) return -1; @@ -1143,7 +1377,7 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg) struct vhost_virtqueue *vq = dev->virtqueue[i]; vhost_user_iotlb_cache_insert(vq, imsg->iova, vva, - imsg->size, imsg->perm); + len, imsg->perm); if (is_vring_iotlb_update(vq, imsg)) *pdev = dev = translate_ring_addresses(dev, i); @@ -1200,13 +1434,13 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) } static int -send_vhost_message(int sockfd, struct VhostUserMsg *msg) +send_vhost_message(int sockfd, struct VhostUserMsg *msg, int *fds, int fd_num) { if (!msg) return 0; return send_fd_message(sockfd, (char *)msg, - VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + VHOST_USER_HDR_SIZE + msg->size, fds, fd_num); } static int @@ -1220,7 +1454,23 @@ send_vhost_reply(int sockfd, struct VhostUserMsg *msg) msg->flags |= VHOST_USER_VERSION; msg->flags |= VHOST_USER_REPLY_MASK; - return send_vhost_message(sockfd, msg); + return send_vhost_message(sockfd, msg, NULL, 0); +} + +static int +send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg, + int *fds, int fd_num) +{ + int ret; + + if (msg->flags & VHOST_USER_NEED_REPLY) + rte_spinlock_lock(&dev->slave_req_lock); + + ret = send_vhost_message(dev->slave_req_fd, msg, fds, fd_num); + if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY)) + rte_spinlock_unlock(&dev->slave_req_lock); + + return ret; } /* @@ -1300,8 +1550,11 @@ vhost_user_msg_handler(int vid, int fd) { struct virtio_net *dev; struct VhostUserMsg msg; + struct rte_vdpa_device *vdpa_dev; + int did = -1; int ret; int unlock_required = 0; + uint32_t skip_master = 0; dev = get_device(vid); if (dev == NULL) @@ -1379,6 +1632,21 @@ vhost_user_msg_handler(int vid, int fd) } + if (dev->extern_ops.pre_msg_handle) { + uint32_t need_reply; + + ret = (*dev->extern_ops.pre_msg_handle)(dev->vid, + (void *)&msg, &need_reply, &skip_master); + if (ret < 0) + goto skip_to_reply; + + if (need_reply) + send_vhost_reply(fd, &msg); + + if (skip_master) + goto skip_to_post_handle; + } + switch (msg.request.master) { case VHOST_USER_GET_FEATURES: msg.payload.u64 = vhost_user_get_features(dev); @@ -1407,7 +1675,7 @@ vhost_user_msg_handler(int vid, int fd) break; case VHOST_USER_SET_MEM_TABLE: - ret = vhost_user_set_mem_table(dev, &msg); + ret = vhost_user_set_mem_table(&dev, &msg); break; 
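
	/*
	 * Illustrative sketch, not part of this patch: the
	 * extern_ops.pre_msg_handle / post_msg_handle hooks invoked around
	 * this switch let an external backend (vhost-crypto in this series)
	 * intercept vhost-user requests; a negative return from either hook
	 * jumps straight to the reply path. The registration helper name
	 * rte_vhost_extern_callback_register() and the ops structure name
	 * struct rte_vhost_user_extern_ops are assumptions here (only the
	 * call sites are visible in this hunk); a backend would wire the
	 * hooks up roughly as follows:
	 *
	 *	static int
	 *	example_pre_handler(int vid, void *msg, uint32_t *require_reply,
	 *			uint32_t *skip_master)
	 *	{
	 *		// Inspect the VhostUserMsg, claim backend-specific requests
	 *		*require_reply = 0;
	 *		*skip_master = 0;  // set to 1 to bypass the switch below
	 *		return 0;          // < 0 aborts straight to the reply path
	 *	}
	 *
	 *	static int
	 *	example_post_handler(int vid, void *msg, uint32_t *require_reply)
	 *	{
	 *		*require_reply = 0;
	 *		return 0;
	 *	}
	 *
	 *	static const struct rte_vhost_user_extern_ops example_ops = {
	 *		.pre_msg_handle  = example_pre_handler,
	 *		.post_msg_handle = example_post_handler,
	 *	};
	 *
	 *	// e.g. from the backend's new_device() callback:
	 *	rte_vhost_extern_callback_register(vid, &example_ops, NULL);
	 */
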
case VHOST_USER_SET_LOG_BASE: @@ -1452,7 +1720,7 @@ vhost_user_msg_handler(int vid, int fd) break; case VHOST_USER_GET_QUEUE_NUM: - msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.payload.u64 = (uint64_t)vhost_user_get_queue_num(dev); msg.size = sizeof(msg.payload.u64); send_vhost_reply(fd, &msg); break; @@ -1479,9 +1747,22 @@ vhost_user_msg_handler(int vid, int fd) default: ret = -1; break; + } + +skip_to_post_handle: + if (dev->extern_ops.post_msg_handle) { + uint32_t need_reply; + ret = (*dev->extern_ops.post_msg_handle)( + dev->vid, (void *)&msg, &need_reply); + if (ret < 0) + goto skip_to_reply; + + if (need_reply) + send_vhost_reply(fd, &msg); } +skip_to_reply: if (unlock_required) vhost_user_unlock_all_queue_pairs(dev); @@ -1505,9 +1786,53 @@ vhost_user_msg_handler(int vid, int fd) } } + did = dev->vdpa_dev_id; + vdpa_dev = rte_vdpa_get_device(did); + if (vdpa_dev && virtio_is_ready(dev) && + !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) && + msg.request.master == VHOST_USER_SET_VRING_ENABLE) { + if (vdpa_dev->ops->dev_conf) + vdpa_dev->ops->dev_conf(dev->vid); + dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED; + if (vhost_user_host_notifier_ctrl(dev->vid, true) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "(%d) software relay is used for vDPA, performance may be low.\n", + dev->vid); + } + } + return 0; } +static int process_slave_message_reply(struct virtio_net *dev, + const VhostUserMsg *msg) +{ + VhostUserMsg msg_reply; + int ret; + + if ((msg->flags & VHOST_USER_NEED_REPLY) == 0) + return 0; + + if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) { + ret = -1; + goto out; + } + + if (msg_reply.request.slave != msg->request.slave) { + RTE_LOG(ERR, VHOST_CONFIG, + "Received unexpected msg type (%u), expected %u\n", + msg_reply.request.slave, msg->request.slave); + ret = -1; + goto out; + } + + ret = msg_reply.payload.u64 ? 
-1 : 0; + +out: + rte_spinlock_unlock(&dev->slave_req_lock); + return ret; +} + int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) { @@ -1523,7 +1848,7 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) }, }; - ret = send_vhost_message(dev->slave_req_fd, &msg); + ret = send_vhost_message(dev->slave_req_fd, &msg, NULL, 0); if (ret < 0) { RTE_LOG(ERR, VHOST_CONFIG, "Failed to send IOTLB miss message (%d)\n", @@ -1533,3 +1858,101 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm) return 0; } + +static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev, + int index, int fd, + uint64_t offset, + uint64_t size) +{ + int *fdp = NULL; + size_t fd_num = 0; + int ret; + struct VhostUserMsg msg = { + .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG, + .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY, + .size = sizeof(msg.payload.area), + .payload.area = { + .u64 = index & VHOST_USER_VRING_IDX_MASK, + .size = size, + .offset = offset, + }, + }; + + if (fd < 0) + msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK; + else { + fdp = &fd; + fd_num = 1; + } + + ret = send_vhost_slave_message(dev, &msg, fdp, fd_num); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to set host notifier (%d)\n", ret); + return ret; + } + + return process_slave_message_reply(dev, &msg); +} + +int vhost_user_host_notifier_ctrl(int vid, bool enable) +{ + struct virtio_net *dev; + struct rte_vdpa_device *vdpa_dev; + int vfio_device_fd, did, ret = 0; + uint64_t offset, size; + unsigned int i; + + dev = get_device(vid); + if (!dev) + return -ENODEV; + + did = dev->vdpa_dev_id; + if (did < 0) + return -EINVAL; + + if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) || + !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) || + !(dev->protocol_features & + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER))) + return -ENOTSUP; + + vdpa_dev = rte_vdpa_get_device(did); + if (!vdpa_dev) + return -ENODEV; + + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP); + RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP); + + vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid); + if (vfio_device_fd < 0) + return -ENOTSUP; + + if (enable) { + for (i = 0; i < dev->nr_vring; i++) { + if (vdpa_dev->ops->get_notify_area(vid, i, &offset, + &size) < 0) { + ret = -ENOTSUP; + goto disable; + } + + if (vhost_user_slave_set_vring_host_notifier(dev, i, + vfio_device_fd, offset, size) < 0) { + ret = -EFAULT; + goto disable; + } + } + } else { +disable: + for (i = 0; i < dev->nr_vring; i++) { + vhost_user_slave_set_vring_host_notifier(dev, i, -1, + 0, 0); + } + } + + return ret; +} diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h index d4bd604b..42166adf 100644 --- a/lib/librte_vhost/vhost_user.h +++ b/lib/librte_vhost/vhost_user.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2010-2018 Intel Corporation */ #ifndef _VHOST_NET_USER_H @@ -14,19 +14,15 @@ #define VHOST_MEMORY_MAX_NREGIONS 8 -#define VHOST_USER_PROTOCOL_F_MQ 0 -#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 -#define VHOST_USER_PROTOCOL_F_RARP 2 -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 -#define VHOST_USER_PROTOCOL_F_NET_MTU 4 -#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 - #define 
VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ - (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CRYPTO_SESSION) | \ + (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \ + (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)) typedef enum VhostUserRequest { VHOST_USER_NONE = 0, @@ -52,12 +48,15 @@ typedef enum VhostUserRequest { VHOST_USER_NET_SET_MTU = 20, VHOST_USER_SET_SLAVE_REQ_FD = 21, VHOST_USER_IOTLB_MSG = 22, - VHOST_USER_MAX + VHOST_USER_CRYPTO_CREATE_SESS = 26, + VHOST_USER_CRYPTO_CLOSE_SESS = 27, + VHOST_USER_MAX = 28 } VhostUserRequest; typedef enum VhostUserSlaveRequest { VHOST_USER_SLAVE_NONE = 0, VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, VHOST_USER_SLAVE_MAX } VhostUserSlaveRequest; @@ -79,10 +78,40 @@ typedef struct VhostUserLog { uint64_t mmap_offset; } VhostUserLog; +/* Comply with Cryptodev-Linux */ +#define VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH 512 +#define VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH 64 + +/* Same structure as vhost-user backend session info */ +typedef struct VhostUserCryptoSessionParam { + int64_t session_id; + uint32_t op_code; + uint32_t cipher_algo; + uint32_t cipher_key_len; + uint32_t hash_algo; + uint32_t digest_len; + uint32_t auth_key_len; + uint32_t aad_len; + uint8_t op_type; + uint8_t dir; + uint8_t hash_mode; + uint8_t chaining_dir; + uint8_t *ciphe_key; + uint8_t *auth_key; + uint8_t cipher_key_buf[VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH]; + uint8_t auth_key_buf[VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH]; +} VhostUserCryptoSessionParam; + +typedef struct VhostUserVringArea { + uint64_t u64; + uint64_t size; + uint64_t offset; +} VhostUserVringArea; + typedef struct VhostUserMsg { union { - VhostUserRequest master; - VhostUserSlaveRequest slave; + uint32_t master; /* a VhostUserRequest value */ + uint32_t slave; /* a VhostUserSlaveRequest value*/ } request; #define VHOST_USER_VERSION_MASK 0x3 @@ -99,6 +128,8 @@ typedef struct VhostUserMsg { VhostUserMemory memory; VhostUserLog log; struct vhost_iotlb_msg iotlb; + VhostUserCryptoSessionParam crypto_session; + VhostUserVringArea area; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; } __attribute((packed)) VhostUserMsg; @@ -112,6 +143,7 @@ typedef struct VhostUserMsg { /* vhost_user.c */ int vhost_user_msg_handler(int vid, int fd); int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm); +int vhost_user_host_notifier_ctrl(int vid, bool enable); /* socket.c */ int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); diff --git a/lib/librte_vhost/virtio_crypto.h b/lib/librte_vhost/virtio_crypto.h new file mode 100644 index 00000000..e3b93573 --- /dev/null +++ b/lib/librte_vhost/virtio_crypto.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD. 
+ */ + +#ifndef _VIRTIO_CRYPTO_H +#define _VIRTIO_CRYPTO_H + +#define VIRTIO_CRYPTO_SERVICE_CIPHER 0 +#define VIRTIO_CRYPTO_SERVICE_HASH 1 +#define VIRTIO_CRYPTO_SERVICE_MAC 2 +#define VIRTIO_CRYPTO_SERVICE_AEAD 3 + +#define VIRTIO_CRYPTO_OPCODE(service, op) (((service) << 8) | (op)) + +struct virtio_crypto_ctrl_header { +#define VIRTIO_CRYPTO_CIPHER_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x02) +#define VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x03) +#define VIRTIO_CRYPTO_HASH_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x02) +#define VIRTIO_CRYPTO_HASH_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x03) +#define VIRTIO_CRYPTO_MAC_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x02) +#define VIRTIO_CRYPTO_MAC_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x03) +#define VIRTIO_CRYPTO_AEAD_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x02) +#define VIRTIO_CRYPTO_AEAD_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x03) + uint32_t opcode; + uint32_t algo; + uint32_t flag; + /* data virtqueue id */ + uint32_t queue_id; +}; + +struct virtio_crypto_cipher_session_para { +#define VIRTIO_CRYPTO_NO_CIPHER 0 +#define VIRTIO_CRYPTO_CIPHER_ARC4 1 +#define VIRTIO_CRYPTO_CIPHER_AES_ECB 2 +#define VIRTIO_CRYPTO_CIPHER_AES_CBC 3 +#define VIRTIO_CRYPTO_CIPHER_AES_CTR 4 +#define VIRTIO_CRYPTO_CIPHER_DES_ECB 5 +#define VIRTIO_CRYPTO_CIPHER_DES_CBC 6 +#define VIRTIO_CRYPTO_CIPHER_3DES_ECB 7 +#define VIRTIO_CRYPTO_CIPHER_3DES_CBC 8 +#define VIRTIO_CRYPTO_CIPHER_3DES_CTR 9 +#define VIRTIO_CRYPTO_CIPHER_KASUMI_F8 10 +#define VIRTIO_CRYPTO_CIPHER_SNOW3G_UEA2 11 +#define VIRTIO_CRYPTO_CIPHER_AES_F8 12 +#define VIRTIO_CRYPTO_CIPHER_AES_XTS 13 +#define VIRTIO_CRYPTO_CIPHER_ZUC_EEA3 14 + uint32_t algo; + /* length of key */ + uint32_t keylen; + +#define VIRTIO_CRYPTO_OP_ENCRYPT 1 +#define VIRTIO_CRYPTO_OP_DECRYPT 2 + /* encrypt or decrypt */ + uint32_t op; + uint32_t padding; +}; + +struct virtio_crypto_session_input { + /* Device-writable part */ + uint64_t session_id; + uint32_t status; + uint32_t padding; +}; + +struct virtio_crypto_cipher_session_req { + struct virtio_crypto_cipher_session_para para; + uint8_t padding[32]; +}; + +struct virtio_crypto_hash_session_para { +#define VIRTIO_CRYPTO_NO_HASH 0 +#define VIRTIO_CRYPTO_HASH_MD5 1 +#define VIRTIO_CRYPTO_HASH_SHA1 2 +#define VIRTIO_CRYPTO_HASH_SHA_224 3 +#define VIRTIO_CRYPTO_HASH_SHA_256 4 +#define VIRTIO_CRYPTO_HASH_SHA_384 5 +#define VIRTIO_CRYPTO_HASH_SHA_512 6 +#define VIRTIO_CRYPTO_HASH_SHA3_224 7 +#define VIRTIO_CRYPTO_HASH_SHA3_256 8 +#define VIRTIO_CRYPTO_HASH_SHA3_384 9 +#define VIRTIO_CRYPTO_HASH_SHA3_512 10 +#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE128 11 +#define VIRTIO_CRYPTO_HASH_SHA3_SHAKE256 12 + uint32_t algo; + /* hash result length */ + uint32_t hash_result_len; + uint8_t padding[8]; +}; + +struct virtio_crypto_hash_create_session_req { + struct virtio_crypto_hash_session_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_mac_session_para { +#define VIRTIO_CRYPTO_NO_MAC 0 +#define VIRTIO_CRYPTO_MAC_HMAC_MD5 1 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA1 2 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_224 3 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_256 4 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_384 5 +#define VIRTIO_CRYPTO_MAC_HMAC_SHA_512 6 +#define VIRTIO_CRYPTO_MAC_CMAC_3DES 25 +#define VIRTIO_CRYPTO_MAC_CMAC_AES 26 +#define 
VIRTIO_CRYPTO_MAC_KASUMI_F9 27 +#define VIRTIO_CRYPTO_MAC_SNOW3G_UIA2 28 +#define VIRTIO_CRYPTO_MAC_GMAC_AES 41 +#define VIRTIO_CRYPTO_MAC_GMAC_TWOFISH 42 +#define VIRTIO_CRYPTO_MAC_CBCMAC_AES 49 +#define VIRTIO_CRYPTO_MAC_CBCMAC_KASUMI_F9 50 +#define VIRTIO_CRYPTO_MAC_XCBC_AES 53 + uint32_t algo; + /* hash result length */ + uint32_t hash_result_len; + /* length of authenticated key */ + uint32_t auth_key_len; + uint32_t padding; +}; + +struct virtio_crypto_mac_create_session_req { + struct virtio_crypto_mac_session_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_aead_session_para { +#define VIRTIO_CRYPTO_NO_AEAD 0 +#define VIRTIO_CRYPTO_AEAD_GCM 1 +#define VIRTIO_CRYPTO_AEAD_CCM 2 +#define VIRTIO_CRYPTO_AEAD_CHACHA20_POLY1305 3 + uint32_t algo; + /* length of key */ + uint32_t key_len; + /* hash result length */ + uint32_t hash_result_len; + /* length of the additional authenticated data (AAD) in bytes */ + uint32_t aad_len; + /* encrypt or decrypt, See above VIRTIO_CRYPTO_OP_* */ + uint32_t op; + uint32_t padding; +}; + +struct virtio_crypto_aead_create_session_req { + struct virtio_crypto_aead_session_para para; + uint8_t padding[32]; +}; + +struct virtio_crypto_alg_chain_session_para { +#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER 1 +#define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH 2 + uint32_t alg_chain_order; +/* Plain hash */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_PLAIN 1 +/* Authenticated hash (mac) */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH 2 +/* Nested hash */ +#define VIRTIO_CRYPTO_SYM_HASH_MODE_NESTED 3 + uint32_t hash_mode; + struct virtio_crypto_cipher_session_para cipher_param; + union { + struct virtio_crypto_hash_session_para hash_param; + struct virtio_crypto_mac_session_para mac_param; + uint8_t padding[16]; + } u; + /* length of the additional authenticated data (AAD) in bytes */ + uint32_t aad_len; + uint32_t padding; +}; + +struct virtio_crypto_alg_chain_session_req { + struct virtio_crypto_alg_chain_session_para para; +}; + +struct virtio_crypto_sym_create_session_req { + union { + struct virtio_crypto_cipher_session_req cipher; + struct virtio_crypto_alg_chain_session_req chain; + uint8_t padding[48]; + } u; + + /* Device-readable part */ + +/* No operation */ +#define VIRTIO_CRYPTO_SYM_OP_NONE 0 +/* Cipher only operation on the data */ +#define VIRTIO_CRYPTO_SYM_OP_CIPHER 1 +/* + * Chain any cipher with any hash or mac operation. 
The order + * depends on the value of alg_chain_order param + */ +#define VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING 2 + uint32_t op_type; + uint32_t padding; +}; + +struct virtio_crypto_destroy_session_req { + /* Device-readable part */ + uint64_t session_id; + uint8_t padding[48]; +}; + +/* The request of the control virtqueue's packet */ +struct virtio_crypto_op_ctrl_req { + struct virtio_crypto_ctrl_header header; + + union { + struct virtio_crypto_sym_create_session_req + sym_create_session; + struct virtio_crypto_hash_create_session_req + hash_create_session; + struct virtio_crypto_mac_create_session_req + mac_create_session; + struct virtio_crypto_aead_create_session_req + aead_create_session; + struct virtio_crypto_destroy_session_req + destroy_session; + uint8_t padding[56]; + } u; +}; + +struct virtio_crypto_op_header { +#define VIRTIO_CRYPTO_CIPHER_ENCRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x00) +#define VIRTIO_CRYPTO_CIPHER_DECRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_CIPHER, 0x01) +#define VIRTIO_CRYPTO_HASH \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_HASH, 0x00) +#define VIRTIO_CRYPTO_MAC \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_MAC, 0x00) +#define VIRTIO_CRYPTO_AEAD_ENCRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x00) +#define VIRTIO_CRYPTO_AEAD_DECRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x01) + uint32_t opcode; + /* algo should be service-specific algorithms */ + uint32_t algo; + /* session_id should be service-specific algorithms */ + uint64_t session_id; + /* control flag to control the request */ + uint32_t flag; + uint32_t padding; +}; + +struct virtio_crypto_cipher_para { + /* + * Byte Length of valid IV/Counter + * + * For block ciphers in CBC or F8 mode, or for Kasumi in F8 mode, or for + * SNOW3G in UEA2 mode, this is the length of the IV (which + * must be the same as the block length of the cipher). + * For block ciphers in CTR mode, this is the length of the counter + * (which must be the same as the block length of the cipher). + * For AES-XTS, this is the 128bit tweak, i, from IEEE Std 1619-2007. + * + * The IV/Counter will be updated after every partial cryptographic + * operation. + */ + uint32_t iv_len; + /* length of source data */ + uint32_t src_data_len; + /* length of dst data */ + uint32_t dst_data_len; + uint32_t padding; +}; + +struct virtio_crypto_hash_para { + /* length of source data */ + uint32_t src_data_len; + /* hash result length */ + uint32_t hash_result_len; +}; + +struct virtio_crypto_mac_para { + struct virtio_crypto_hash_para hash; +}; + +struct virtio_crypto_aead_para { + /* + * Byte Length of valid IV data pointed to by the below iv_addr + * parameter. + * + * For GCM mode, this is either 12 (for 96-bit IVs) or 16, in which + * case iv_addr points to J0. + * For CCM mode, this is the length of the nonce, which can be in the + * range 7 to 13 inclusive. 
+ */ + uint32_t iv_len; + /* length of additional auth data */ + uint32_t aad_len; + /* length of source data */ + uint32_t src_data_len; + /* length of dst data */ + uint32_t dst_data_len; +}; + +struct virtio_crypto_cipher_data_req { + /* Device-readable part */ + struct virtio_crypto_cipher_para para; + uint8_t padding[24]; +}; + +struct virtio_crypto_hash_data_req { + /* Device-readable part */ + struct virtio_crypto_hash_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_mac_data_req { + /* Device-readable part */ + struct virtio_crypto_mac_para para; + uint8_t padding[40]; +}; + +struct virtio_crypto_alg_chain_data_para { + uint32_t iv_len; + /* Length of source data */ + uint32_t src_data_len; + /* Length of destination data */ + uint32_t dst_data_len; + /* Starting point for cipher processing in source data */ + uint32_t cipher_start_src_offset; + /* Length of the source data that the cipher will be computed on */ + uint32_t len_to_cipher; + /* Starting point for hash processing in source data */ + uint32_t hash_start_src_offset; + /* Length of the source data that the hash will be computed on */ + uint32_t len_to_hash; + /* Length of the additional auth data */ + uint32_t aad_len; + /* Length of the hash result */ + uint32_t hash_result_len; + uint32_t reserved; +}; + +struct virtio_crypto_alg_chain_data_req { + /* Device-readable part */ + struct virtio_crypto_alg_chain_data_para para; +}; + +struct virtio_crypto_sym_data_req { + union { + struct virtio_crypto_cipher_data_req cipher; + struct virtio_crypto_alg_chain_data_req chain; + uint8_t padding[40]; + } u; + + /* See above VIRTIO_CRYPTO_SYM_OP_* */ + uint32_t op_type; + uint32_t padding; +}; + +struct virtio_crypto_aead_data_req { + /* Device-readable part */ + struct virtio_crypto_aead_para para; + uint8_t padding[32]; +}; + +/* The request of the data virtqueue's packet */ +struct virtio_crypto_op_data_req { + struct virtio_crypto_op_header header; + + union { + struct virtio_crypto_sym_data_req sym_req; + struct virtio_crypto_hash_data_req hash_req; + struct virtio_crypto_mac_data_req mac_req; + struct virtio_crypto_aead_data_req aead_req; + uint8_t padding[48]; + } u; +}; + +#define VIRTIO_CRYPTO_OK 0 +#define VIRTIO_CRYPTO_ERR 1 +#define VIRTIO_CRYPTO_BADMSG 2 +#define VIRTIO_CRYPTO_NOTSUPP 3 +#define VIRTIO_CRYPTO_INVSESS 4 /* Invalid session id */ + +/* The accelerator hardware is ready */ +#define VIRTIO_CRYPTO_S_HW_READY (1 << 0) + +struct virtio_crypto_config { + /* See VIRTIO_CRYPTO_OP_* above */ + uint32_t status; + + /* + * Maximum number of data queue + */ + uint32_t max_dataqueues; + + /* + * Specifies the services mask which the device support, + * see VIRTIO_CRYPTO_SERVICE_* above + */ + uint32_t crypto_services; + + /* Detailed algorithms mask */ + uint32_t cipher_algo_l; + uint32_t cipher_algo_h; + uint32_t hash_algo; + uint32_t mac_algo_l; + uint32_t mac_algo_h; + uint32_t aead_algo; + /* Maximum length of cipher key */ + uint32_t max_cipher_key_len; + /* Maximum length of authenticated key */ + uint32_t max_auth_key_len; + uint32_t reserve; + /* Maximum size of each crypto request's content */ + uint64_t max_size; +}; + +struct virtio_crypto_inhdr { + /* See VIRTIO_CRYPTO_* above */ + uint8_t status; +}; +#endif /* _VIRTIO_CRYPTO_H */ diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 700aca7c..99c7afc8 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -16,6 +16,7 @@ #include <rte_sctp.h> #include <rte_arp.h> #include 
<rte_spinlock.h> +#include <rte_malloc.h> #include "iotlb.h" #include "vhost.h" @@ -24,60 +25,174 @@ #define MAX_BATCH_LEN 256 +static __rte_always_inline bool +rxvq_is_mergeable(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; } +static __rte_always_inline void * +alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t desc_addr, uint64_t desc_len) +{ + void *idesc; + uint64_t src, dst; + uint64_t len, remain = desc_len; + + idesc = rte_malloc(__func__, desc_len, 0); + if (unlikely(!idesc)) + return 0; + + dst = (uint64_t)(uintptr_t)idesc; + + while (remain) { + len = remain; + src = vhost_iova_to_vva(dev, vq, desc_addr, &len, + VHOST_ACCESS_RO); + if (unlikely(!src || !len)) { + rte_free(idesc); + return 0; + } + + rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + desc_addr += len; + } + + return idesc; +} + static __rte_always_inline void -do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint16_t to, uint16_t from, uint16_t size) +free_ind_table(void *idesc) +{ + rte_free(idesc); +} + +static __rte_always_inline void +do_flush_shadow_used_ring_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t to, uint16_t from, uint16_t size) { rte_memcpy(&vq->used->ring[to], - &vq->shadow_used_ring[from], + &vq->shadow_used_split[from], size * sizeof(struct vring_used_elem)); - vhost_log_used_vring(dev, vq, + vhost_log_cache_used_vring(dev, vq, offsetof(struct vring_used, ring[to]), size * sizeof(struct vring_used_elem)); } static __rte_always_inline void -flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) +flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { uint16_t used_idx = vq->last_used_idx & (vq->size - 1); if (used_idx + vq->shadow_used_idx <= vq->size) { - do_flush_shadow_used_ring(dev, vq, used_idx, 0, + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, vq->shadow_used_idx); } else { uint16_t size; /* update used ring interval [used_idx, vq->size] */ size = vq->size - used_idx; - do_flush_shadow_used_ring(dev, vq, used_idx, 0, size); + do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size); /* update the left half used ring interval [0, left_size] */ - do_flush_shadow_used_ring(dev, vq, 0, size, + do_flush_shadow_used_ring_split(dev, vq, 0, size, vq->shadow_used_idx - size); } vq->last_used_idx += vq->shadow_used_idx; rte_smp_wmb(); + vhost_log_cache_sync(dev, vq); + *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + vq->shadow_used_idx = 0; vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx)); } static __rte_always_inline void -update_shadow_used_ring(struct vhost_virtqueue *vq, +update_shadow_used_ring_split(struct vhost_virtqueue *vq, uint16_t desc_idx, uint16_t len) { uint16_t i = vq->shadow_used_idx++; - vq->shadow_used_ring[i].id = desc_idx; - vq->shadow_used_ring[i].len = len; + vq->shadow_used_split[i].id = desc_idx; + vq->shadow_used_split[i].len = len; +} + +static __rte_always_inline void +flush_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = 
vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if (vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->last_used_idx += vq->shadow_used_packed[i].count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } + + rte_smp_wmb(); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + +static __rte_always_inline void +update_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len, uint16_t count) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; } static inline void @@ -89,9 +204,11 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) for (i = 0; i < count; i++) { rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); - vhost_log_write(dev, elem[i].log_addr, elem[i].len); + vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len); PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); } + + vq->batch_copy_nb_elems = 0; } static inline void @@ -103,6 +220,8 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) for (i = 0; i < count; i++) rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + + vq->batch_copy_nb_elems = 0; } /* avoid write operation when necessary, to lessen cache issues */ @@ -111,7 +230,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) (var) = (val); \ } while (0) -static void +static __rte_always_inline void virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) { uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK; @@ -173,273 +292,268 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) } static __rte_always_inline int -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct vring_desc *descs, struct rte_mbuf *m, - uint16_t desc_idx, uint32_t size) +map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint64_t desc_iova, uint64_t desc_len, uint8_t perm) { - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - struct vring_desc *desc; - uint64_t desc_addr; - /* A counter to avoid desc dead loop chain */ - uint16_t nr_desc = 1; - struct batch_copy_elem *batch_copy = vq->batch_copy_elems; - uint16_t copy_nb = vq->batch_copy_nb_elems; - int error = 0; + uint16_t vec_id = *vec_idx; - desc = &descs[desc_idx]; - desc_addr = vhost_iova_to_vva(dev, vq, desc->addr, - desc->len, VHOST_ACCESS_RW); - /* - * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid - * performance issue with some versions of gcc (4.8.4 and 5.3.0) which - * otherwise stores offset on the stack instead of in a register. 
- */ - if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) { - error = -1; - goto out; + while (desc_len) { + uint64_t desc_addr; + uint64_t desc_chunck_len = desc_len; + + if (unlikely(vec_id >= BUF_VECTOR_MAX)) + return -1; + + desc_addr = vhost_iova_to_vva(dev, vq, + desc_iova, + &desc_chunck_len, + perm); + if (unlikely(!desc_addr)) + return -1; + + buf_vec[vec_id].buf_iova = desc_iova; + buf_vec[vec_id].buf_addr = desc_addr; + buf_vec[vec_id].buf_len = desc_chunck_len; + + desc_len -= desc_chunck_len; + desc_iova += desc_chunck_len; + vec_id++; } + *vec_idx = vec_id; - rte_prefetch0((void *)(uintptr_t)desc_addr); + return 0; +} - virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr); - vhost_log_write(dev, desc->addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); +static __rte_always_inline int +fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint16_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *desc_chain_head, + uint16_t *desc_chain_len, uint8_t perm) +{ + uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; + uint16_t vec_id = *vec_idx; + uint32_t len = 0; + uint64_t dlen; + struct vring_desc *descs = vq->desc; + struct vring_desc *idesc = NULL; - desc_offset = dev->vhost_hlen; - desc_avail = desc->len - dev->vhost_hlen; + *desc_chain_head = idx; - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current mbuf, fetch next */ - if (mbuf_avail == 0) { - m = m->next; + if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { + dlen = vq->desc[idx].len; + descs = (struct vring_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, vq->desc[idx].addr, + &dlen, + VHOST_ACCESS_RO); + if (unlikely(!descs)) + return -1; - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } + if (unlikely(dlen < vq->desc[idx].len)) { + /* + * The indirect desc table is not contiguous + * in process VA space, we have to copy it. 
+ */ + idesc = alloc_copy_ind_table(dev, vq, + vq->desc[idx].addr, vq->desc[idx].len); + if (unlikely(!idesc)) + return -1; - /* done with current desc buf, fetch next */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) { - /* Room in vring buffer is not enough */ - error = -1; - goto out; - } - if (unlikely(desc->next >= size || ++nr_desc > size)) { - error = -1; - goto out; - } + descs = idesc; + } - desc = &descs[desc->next]; - desc_addr = vhost_iova_to_vva(dev, vq, desc->addr, - desc->len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } + idx = 0; + } - desc_offset = 0; - desc_avail = desc->len; + while (1) { + if (unlikely(idx >= vq->size)) { + free_ind_table(idesc); + return -1; } - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { - rte_memcpy((void *)((uintptr_t)(desc_addr + - desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, desc->addr + desc_offset, cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - } else { - batch_copy[copy_nb].dst = - (void *)((uintptr_t)(desc_addr + desc_offset)); - batch_copy[copy_nb].src = - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); - batch_copy[copy_nb].log_addr = desc->addr + desc_offset; - batch_copy[copy_nb].len = cpy_len; - copy_nb++; + len += descs[idx].len; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[idx].addr, descs[idx].len, + perm))) { + free_ind_table(idesc); + return -1; } - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; + if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + idx = descs[idx].next; } -out: - vq->batch_copy_nb_elems = copy_nb; + *desc_chain_len = len; + *vec_idx = vec_id; - return error; + if (unlikely(!!idesc)) + free_ind_table(idesc); + + return 0; } -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are successfully - * added to the RX queue. This function works when the mbuf is scattered, but - * it doesn't support the mergeable feature. 
+/* + * Returns -1 on fail, 0 on success */ -static __rte_always_inline uint32_t -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) +static inline int +reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *num_buffers, uint16_t avail_head, + uint16_t *nr_vec) { - struct vhost_virtqueue *vq; - uint16_t avail_idx, free_entries, start_idx; - uint16_t desc_indexes[MAX_PKT_BURST]; - struct vring_desc *descs; - uint16_t used_idx; - uint32_t i, sz; + uint16_t cur_idx; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } + uint16_t head_idx = 0; + uint16_t len = 0; - vq = dev->virtqueue[queue_id]; + *num_buffers = 0; + cur_idx = vq->last_avail_idx; - rte_spinlock_lock(&vq->access_lock); + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; - if (unlikely(vq->enabled == 0)) - goto out_access_unlock; + while (size > 0) { + if (unlikely(cur_idx == avail_head)) + return -1; + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. + */ + if (unlikely(++tries > max_tries)) + return -1; - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); + if (unlikely(fill_vec_buf_split(dev, vq, cur_idx, + &vec_idx, buf_vec, + &head_idx, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + len = RTE_MIN(len, size); + update_shadow_used_ring_split(vq, head_idx, len); + size -= len; - if (unlikely(vq->access_ok == 0)) { - if (unlikely(vring_translate(dev, vq) < 0)) { - count = 0; - goto out; - } + cur_idx++; + *num_buffers += 1; } - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - start_idx = vq->last_used_idx; - free_entries = avail_idx - start_idx; - count = RTE_MIN(count, free_entries); - count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); - if (count == 0) - goto out; + *nr_vec = vec_idx; - LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", - dev->vid, start_idx, start_idx + count); + return 0; +} - vq->batch_copy_nb_elems = 0; +static __rte_always_inline int +fill_vec_buf_packed_indirect(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct vring_packed_desc *desc, uint16_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *len, uint8_t perm) +{ + uint16_t i; + uint32_t nr_descs; + uint16_t vec_id = *vec_idx; + uint64_t dlen; + struct vring_packed_desc *descs, *idescs = NULL; + + dlen = desc->len; + descs = (struct vring_packed_desc *)(uintptr_t) + vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO); + if (unlikely(!descs)) + return -1; + + if (unlikely(dlen < desc->len)) { + /* + * The indirect desc table is not contiguous + * in process VA space, we have to copy it. + */ + idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len); + if (unlikely(!idescs)) + return -1; - /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ - rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); - for (i = 0; i < count; i++) { - used_idx = (start_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[used_idx]; - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = pkts[i]->pkt_len + - dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); + descs = idescs; } - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - uint16_t desc_idx = desc_indexes[i]; - int err; - - if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { - descs = (struct vring_desc *)(uintptr_t) - vhost_iova_to_vva(dev, - vq, vq->desc[desc_idx].addr, - vq->desc[desc_idx].len, - VHOST_ACCESS_RO); - if (unlikely(!descs)) { - count = i; - break; - } - - desc_idx = 0; - sz = vq->desc[desc_idx].len / sizeof(*descs); - } else { - descs = vq->desc; - sz = vq->size; - } + nr_descs = desc->len / sizeof(struct vring_packed_desc); + if (unlikely(nr_descs >= vq->size)) { + free_ind_table(idescs); + return -1; + } - err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz); - if (unlikely(err)) { - count = i; - break; + for (i = 0; i < nr_descs; i++) { + if (unlikely(vec_id >= BUF_VECTOR_MAX)) { + free_ind_table(idescs); + return -1; } - if (i + 1 < count) - rte_prefetch0(&vq->desc[desc_indexes[i+1]]); + *len += descs[i].len; + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[i].addr, descs[i].len, + perm))) + return -1; } + *vec_idx = vec_id; - do_data_copy_enqueue(dev, vq); - - rte_smp_wmb(); - - *(volatile uint16_t *)&vq->used->idx += count; - vq->last_used_idx += count; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - vhost_vring_call(dev, vq); -out: - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_unlock(vq); - -out_access_unlock: - rte_spinlock_unlock(&vq->access_lock); + if (unlikely(!!idescs)) + free_ind_table(idescs); - return count; + return 0; } static __rte_always_inline int -fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t avail_idx, uint32_t *vec_idx, - struct buf_vector *buf_vec, uint16_t *desc_chain_head, - uint16_t *desc_chain_len) +fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t avail_idx, uint16_t *desc_count, + struct buf_vector *buf_vec, uint16_t *vec_idx, + uint16_t *buf_id, uint16_t *len, uint8_t perm) { - uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; - uint32_t vec_id = *vec_idx; - uint32_t len = 0; - struct vring_desc *descs = vq->desc; + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t vec_id = *vec_idx; - *desc_chain_head = idx; + if (avail_idx < vq->last_avail_idx) + wrap_counter ^= 1; - if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { - descs = (struct vring_desc *)(uintptr_t) - vhost_iova_to_vva(dev, vq, vq->desc[idx].addr, - vq->desc[idx].len, - VHOST_ACCESS_RO); - if (unlikely(!descs)) - return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter))) + return -1; - idx = 0; - } + *desc_count = 0; while (1) { - if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) + if (unlikely(vec_id >= BUF_VECTOR_MAX)) return -1; - len += descs[idx].len; - buf_vec[vec_id].buf_addr = descs[idx].addr; - buf_vec[vec_id].buf_len = descs[idx].len; - buf_vec[vec_id].desc_idx = idx; - vec_id++; + *desc_count += 1; + *buf_id = descs[avail_idx].id; - if ((descs[idx].flags & 
VRING_DESC_F_NEXT) == 0) + if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) { + if (unlikely(fill_vec_buf_packed_indirect(dev, vq, + &descs[avail_idx], + &vec_id, buf_vec, + len, perm) < 0)) + return -1; + } else { + *len += descs[avail_idx].len; + + if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id, + descs[avail_idx].addr, + descs[avail_idx].len, + perm))) + return -1; + } + + if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0) break; - idx = descs[idx].next; + if (++avail_idx >= vq->size) { + avail_idx -= vq->size; + wrap_counter ^= 1; + } } - *desc_chain_len = len; *vec_idx = vec_id; return 0; @@ -449,61 +563,74 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, * Returns -1 on fail, 0 on success */ static inline int -reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, +reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t size, struct buf_vector *buf_vec, - uint16_t *num_buffers, uint16_t avail_head) + uint16_t *nr_vec, uint16_t *num_buffers, + uint16_t *nr_descs) { - uint16_t cur_idx; - uint32_t vec_idx = 0; - uint16_t tries = 0; + uint16_t avail_idx; + uint16_t vec_idx = 0; + uint16_t max_tries, tries = 0; - uint16_t head_idx = 0; + uint16_t buf_id = 0; uint16_t len = 0; + uint16_t desc_count; *num_buffers = 0; - cur_idx = vq->last_avail_idx; + avail_idx = vq->last_avail_idx; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; while (size > 0) { - if (unlikely(cur_idx == avail_head)) + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. + */ + if (unlikely(++tries > max_tries)) return -1; - if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, - &head_idx, &len) < 0)) + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &vec_idx, + &buf_id, &len, + VHOST_ACCESS_RO) < 0)) return -1; + len = RTE_MIN(len, size); - update_shadow_used_ring(vq, head_idx, len); + update_shadow_used_ring_packed(vq, buf_id, len, desc_count); size -= len; - cur_idx++; - tries++; - *num_buffers += 1; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(tries >= vq->size)) - return -1; + *nr_descs += desc_count; + *num_buffers += 1; } + *nr_vec = vec_idx; + return 0; } static __rte_always_inline int -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, +copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *m, struct buf_vector *buf_vec, - uint16_t num_buffers) + uint16_t nr_vec, uint16_t num_buffers) { uint32_t vec_idx = 0; - uint64_t desc_addr; uint32_t mbuf_offset, mbuf_avail; - uint32_t desc_offset, desc_avail; + uint32_t buf_offset, buf_avail; + uint64_t buf_addr, buf_iova, buf_len; uint32_t cpy_len; - uint64_t hdr_addr, hdr_phys_addr; + uint64_t hdr_addr; struct rte_mbuf *hdr_mbuf; struct batch_copy_elem *batch_copy = vq->batch_copy_elems; - uint16_t copy_nb = vq->batch_copy_nb_elems; + struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL; int error = 0; if (unlikely(m == NULL)) { @@ -511,45 +638,61 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, goto out; } - desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr, - buf_vec[vec_idx].buf_len, - VHOST_ACCESS_RW); - if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) { + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + if (nr_vec > 1) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { error = -1; goto out; } hdr_mbuf = m; - hdr_addr = desc_addr; - hdr_phys_addr = buf_vec[vec_idx].buf_addr; - rte_prefetch0((void *)(uintptr_t)hdr_addr); + hdr_addr = buf_addr; + if (unlikely(buf_len < dev->vhost_hlen)) + hdr = &tmp_hdr; + else + hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr; - LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", dev->vid, num_buffers); - desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else { + buf_offset = dev->vhost_hlen; + buf_avail = buf_len - dev->vhost_hlen; + } mbuf_avail = rte_pktmbuf_data_len(m); mbuf_offset = 0; while (mbuf_avail != 0 || m->next != NULL) { - /* done with current desc buf, get the next one */ - if (desc_avail == 0) { + /* done with current buf, get the next one */ + if (buf_avail == 0) { vec_idx++; - desc_addr = - vhost_iova_to_vva(dev, vq, - buf_vec[vec_idx].buf_addr, - buf_vec[vec_idx].buf_len, - VHOST_ACCESS_RW); - if (unlikely(!desc_addr)) { + if (unlikely(vec_idx >= nr_vec)) { error = -1; goto out; } - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)desc_addr); - desc_offset = 0; - desc_avail = buf_vec[vec_idx].buf_len; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + + /* Prefetch next buffer address. 
*/ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); + buf_offset = 0; + buf_avail = buf_len; } /* done with current mbuf, get the next one */ @@ -561,129 +704,221 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (hdr_addr) { - struct virtio_net_hdr_mrg_rxbuf *hdr; - - hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t) - hdr_addr; virtio_enqueue_offload(hdr_mbuf, &hdr->hdr); - ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers); - - vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)hdr_addr, - dev->vhost_hlen, 0); + if (rxvq_is_mergeable(dev)) + ASSIGN_UNLESS_EQUAL(hdr->num_buffers, + num_buffers); + + if (unlikely(hdr == &tmp_hdr)) { + uint64_t len; + uint64_t remain = dev->vhost_hlen; + uint64_t src = (uint64_t)(uintptr_t)hdr, dst; + uint64_t iova = buf_vec[0].buf_iova; + uint16_t hdr_vec_idx = 0; + + while (remain) { + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + dst = buf_vec[hdr_vec_idx].buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, + len); + + PRINT_PACKET(dev, (uintptr_t)dst, + (uint32_t)len, 0); + vhost_log_cache_write(dev, vq, + iova, len); + + remain -= len; + iova += len; + src += len; + hdr_vec_idx++; + } + } else { + PRINT_PACKET(dev, (uintptr_t)hdr_addr, + dev->vhost_hlen, 0); + vhost_log_cache_write(dev, vq, + buf_vec[0].buf_iova, + dev->vhost_hlen); + } hdr_addr = 0; } - cpy_len = RTE_MIN(desc_avail, mbuf_avail); + cpy_len = RTE_MIN(buf_avail, mbuf_avail); - if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) { - rte_memcpy((void *)((uintptr_t)(desc_addr + - desc_offset)), + if (likely(cpy_len > MAX_BATCH_LEN || + vq->batch_copy_nb_elems >= vq->size)) { + rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)), rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), cpy_len); - vhost_log_write(dev, - buf_vec[vec_idx].buf_addr + desc_offset, - cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + vhost_log_cache_write(dev, vq, buf_iova + buf_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), cpy_len, 0); } else { - batch_copy[copy_nb].dst = - (void *)((uintptr_t)(desc_addr + desc_offset)); - batch_copy[copy_nb].src = + batch_copy[vq->batch_copy_nb_elems].dst = + (void *)((uintptr_t)(buf_addr + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].src = rte_pktmbuf_mtod_offset(m, void *, mbuf_offset); - batch_copy[copy_nb].log_addr = - buf_vec[vec_idx].buf_addr + desc_offset; - batch_copy[copy_nb].len = cpy_len; - copy_nb++; + batch_copy[vq->batch_copy_nb_elems].log_addr = + buf_iova + buf_offset; + batch_copy[vq->batch_copy_nb_elems].len = cpy_len; + vq->batch_copy_nb_elems++; } mbuf_avail -= cpy_len; mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; } out: - vq->batch_copy_nb_elems = copy_nb; return error; } static __rte_always_inline uint32_t -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, +virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { - struct vhost_virtqueue *vq; uint32_t pkt_idx = 0; uint16_t num_buffers; struct buf_vector buf_vec[BUF_VECTOR_MAX]; uint16_t avail_head; - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } + 
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + avail_head = *((volatile uint16_t *)&vq->avail->idx); - vq = dev->virtqueue[queue_id]; + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; - rte_spinlock_lock(&vq->access_lock); + if (unlikely(reserve_avail_buf_split(dev, vq, + pkt_len, buf_vec, &num_buffers, + avail_head, &nr_vec) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } - if (unlikely(vq->enabled == 0)) - goto out_access_unlock; + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); - if (unlikely(vq->access_ok == 0)) - if (unlikely(vring_translate(dev, vq) < 0)) - goto out; + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - if (count == 0) - goto out; + vq->last_avail_idx += num_buffers; + } - vq->batch_copy_nb_elems = 0; + do_data_copy_enqueue(dev, vq); - rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; - vq->shadow_used_idx = 0; - avail_head = *((volatile uint16_t *)&vq->avail->idx); for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + uint16_t nr_vec = 0; + uint16_t nr_descs = 0; - if (unlikely(reserve_avail_buf_mergeable(dev, vq, - pkt_len, buf_vec, &num_buffers, - avail_head) < 0)) { - LOG_DEBUG(VHOST_DATA, + if (unlikely(reserve_avail_buf_packed(dev, vq, + pkt_len, buf_vec, &nr_vec, + &num_buffers, &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) failed to get enough desc from vring\n", dev->vid); vq->shadow_used_idx -= num_buffers; break; } - LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers); - if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx], - buf_vec, num_buffers) < 0) { + if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], + buf_vec, nr_vec, + num_buffers) < 0) { vq->shadow_used_idx -= num_buffers; break; } - vq->last_avail_idx += num_buffers; + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } } do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring(dev, vq); - vhost_vring_call(dev, vq); + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); } + return pkt_idx; +} + +static __rte_always_inline uint32_t +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", 
dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + rte_spinlock_lock(&vq->access_lock); + + if (unlikely(vq->enabled == 0)) + goto out_access_unlock; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) + goto out; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + goto out; + + if (vq_is_packed(dev)) + count = virtio_dev_rx_packed(dev, vq, pkts, count); + else + count = virtio_dev_rx_split(dev, vq, pkts, count); + out: if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) vhost_user_iotlb_rd_unlock(vq); @@ -691,7 +926,7 @@ out: out_access_unlock: rte_spinlock_unlock(&vq->access_lock); - return pkt_idx; + return count; } uint16_t @@ -710,10 +945,7 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id, return 0; } - if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); + return virtio_dev_rx(dev, queue_id, pkts, count); } static inline bool @@ -838,42 +1070,62 @@ put_zmbuf(struct zcopy_mbuf *zmbuf) static __rte_always_inline int copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct vring_desc *descs, uint16_t max_desc, - struct rte_mbuf *m, uint16_t desc_idx, - struct rte_mempool *mbuf_pool) + struct buf_vector *buf_vec, uint16_t nr_vec, + struct rte_mbuf *m, struct rte_mempool *mbuf_pool) { - struct vring_desc *desc; - uint64_t desc_addr; - uint32_t desc_avail, desc_offset; + uint32_t buf_avail, buf_offset; + uint64_t buf_addr, buf_iova, buf_len; uint32_t mbuf_avail, mbuf_offset; uint32_t cpy_len; struct rte_mbuf *cur = m, *prev = m; + struct virtio_net_hdr tmp_hdr; struct virtio_net_hdr *hdr = NULL; /* A counter to avoid desc dead loop chain */ - uint32_t nr_desc = 1; + uint16_t vec_idx = 0; struct batch_copy_elem *batch_copy = vq->batch_copy_elems; - uint16_t copy_nb = vq->batch_copy_nb_elems; int error = 0; - desc = &descs[desc_idx]; - if (unlikely((desc->len < dev->vhost_hlen)) || - (desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; - goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - desc_addr = vhost_iova_to_vva(dev, - vq, desc->addr, - desc->len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) { error = -1; goto out; } + if (likely(nr_vec > 1)) + rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr); + if (virtio_net_with_host_offload(dev)) { - hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); - rte_prefetch0(hdr); + if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { + uint64_t len; + uint64_t remain = sizeof(struct virtio_net_hdr); + uint64_t src; + uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr; + uint16_t hdr_vec_idx = 0; + + /* + * No luck, the virtio-net header doesn't fit + * in a contiguous virtual area. 
+ */ + while (remain) { + len = RTE_MIN(remain, + buf_vec[hdr_vec_idx].buf_len); + src = buf_vec[hdr_vec_idx].buf_addr; + rte_memcpy((void *)(uintptr_t)dst, + (void *)(uintptr_t)src, len); + + remain -= len; + dst += len; + hdr_vec_idx++; + } + + hdr = &tmp_hdr; + } else { + hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); + rte_prefetch0(hdr); + } } /* @@ -881,41 +1133,40 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, * for Tx: the first for storing the header, and others * for storing the data. */ - if (likely((desc->len == dev->vhost_hlen) && - (desc->flags & VRING_DESC_F_NEXT) != 0)) { - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; - goto out; - } - - desc_addr = vhost_iova_to_vva(dev, - vq, desc->addr, - desc->len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else if (buf_len == dev->vhost_hlen) { + if (unlikely(++vec_idx >= nr_vec)) goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - desc_offset = 0; - desc_avail = desc->len; - nr_desc += 1; + buf_offset = 0; + buf_avail = buf_len; } else { - desc_avail = desc->len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; + buf_offset = dev->vhost_hlen; + buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; } - rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset)); + rte_prefetch0((void *)(uintptr_t) + (buf_addr + buf_offset)); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0); + PRINT_PACKET(dev, + (uintptr_t)(buf_addr + buf_offset), + (uint32_t)buf_avail, 0); mbuf_offset = 0; mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; while (1) { uint64_t hpa; - cpy_len = RTE_MIN(desc_avail, mbuf_avail); + cpy_len = RTE_MIN(buf_avail, mbuf_avail); /* * A desc buf might across two host physical pages that are @@ -923,11 +1174,11 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, * will be copied even though zero copy is enabled. 
*/ if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev, - desc->addr + desc_offset, cpy_len)))) { + buf_iova + buf_offset, cpy_len)))) { cur->data_len = cpy_len; cur->data_off = 0; - cur->buf_addr = (void *)(uintptr_t)(desc_addr - + desc_offset); + cur->buf_addr = + (void *)(uintptr_t)(buf_addr + buf_offset); cur->buf_iova = hpa; /* @@ -937,61 +1188,53 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, mbuf_avail = cpy_len; } else { if (likely(cpy_len > MAX_BATCH_LEN || - copy_nb >= vq->size || + vq->batch_copy_nb_elems >= vq->size || (hdr && cur == m))) { rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), - (void *)((uintptr_t)(desc_addr + - desc_offset)), + (void *)((uintptr_t)(buf_addr + + buf_offset)), cpy_len); } else { - batch_copy[copy_nb].dst = + batch_copy[vq->batch_copy_nb_elems].dst = rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset); - batch_copy[copy_nb].src = - (void *)((uintptr_t)(desc_addr + - desc_offset)); - batch_copy[copy_nb].len = cpy_len; - copy_nb++; + batch_copy[vq->batch_copy_nb_elems].src = + (void *)((uintptr_t)(buf_addr + + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].len = + cpy_len; + vq->batch_copy_nb_elems++; } } mbuf_avail -= cpy_len; mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; - /* This desc reaches to its end, get the next one */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) + /* This buf reaches to its end, get the next one */ + if (buf_avail == 0) { + if (++vec_idx >= nr_vec) break; - if (unlikely(desc->next >= max_desc || - ++nr_desc > max_desc)) { - error = -1; - goto out; - } - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) { - error = -1; - goto out; - } - - desc_addr = vhost_iova_to_vva(dev, - vq, desc->addr, - desc->len, - VHOST_ACCESS_RO); - if (unlikely(!desc_addr)) { - error = -1; - goto out; - } + buf_addr = buf_vec[vec_idx].buf_addr; + buf_iova = buf_vec[vec_idx].buf_iova; + buf_len = buf_vec[vec_idx].buf_len; - rte_prefetch0((void *)(uintptr_t)desc_addr); + /* + * Prefecth desc n + 1 buffer while + * desc n buffer is processed. 
+ */ + if (vec_idx + 1 < nr_vec) + rte_prefetch0((void *)(uintptr_t) + buf_vec[vec_idx + 1].buf_addr); - desc_offset = 0; - desc_avail = desc->len; + buf_offset = 0; + buf_avail = buf_len; - PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); + PRINT_PACKET(dev, (uintptr_t)buf_addr, + (uint32_t)buf_avail, 0); } /* @@ -1027,38 +1270,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, vhost_dequeue_offload(hdr, m); out: - vq->batch_copy_nb_elems = copy_nb; return error; } -static __rte_always_inline void -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t used_idx, uint32_t desc_idx) -{ - vq->used->ring[used_idx].id = desc_idx; - vq->used->ring[used_idx].len = 0; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); -} - -static __rte_always_inline void -update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t count) -{ - if (unlikely(count == 0)) - return; - - rte_smp_wmb(); - rte_smp_rmb(); - - vq->used->idx += count; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - vhost_vring_call(dev, vq); -} - static __rte_always_inline struct zcopy_mbuf * get_zmbuf(struct vhost_virtqueue *vq) { @@ -1118,66 +1333,137 @@ restore_mbuf(struct rte_mbuf *m) } } -uint16_t -rte_vhost_dequeue_burst(int vid, uint16_t queue_id, +static __rte_always_inline uint16_t +virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { - struct virtio_net *dev; - struct rte_mbuf *rarp_mbuf = NULL; - struct vhost_virtqueue *vq; - uint32_t desc_indexes[MAX_PKT_BURST]; - uint32_t used_idx; - uint32_t i = 0; + uint16_t i; uint16_t free_entries; - uint16_t avail_idx; - dev = get_device(vid); - if (!dev) - return 0; + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; - if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { - RTE_LOG(ERR, VHOST_DATA, - "(%d) %s: built-in vhost net backend is disabled.\n", - dev->vid, __func__); - return 0; - } + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; + if (mbuf_is_consumed(zmbuf->mbuf)) { + update_shadow_used_ring_split(vq, + zmbuf->desc_idx, 0); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } + + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); } - vq = dev->virtqueue[queue_id]; + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); - if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0)) + free_entries = *((volatile uint16_t *)&vq->avail->idx) - + vq->last_avail_idx; + if (free_entries == 0) return 0; - if (unlikely(vq->enabled == 0)) - goto out_access_unlock; + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - vq->batch_copy_nb_elems = 0; + count = RTE_MIN(count, MAX_PKT_BURST); + count = RTE_MIN(count, free_entries); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); - if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) - vhost_user_iotlb_rd_lock(vq); + for (i = 0; i < count; i++) { + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t 
head_idx, dummy_len; + uint16_t nr_vec = 0; + int err; - if (unlikely(vq->access_ok == 0)) - if (unlikely(vring_translate(dev, vq) < 0)) - goto out; + if (unlikely(fill_vec_buf_split(dev, vq, + vq->last_avail_idx + i, + &nr_vec, buf_vec, + &head_idx, &dummy_len, + VHOST_ACCESS_RO) < 0)) + break; + + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_split(vq, head_idx, 0); + + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); + + pkts[i] = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(pkts[i] == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + break; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); + break; + } + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); + break; + } + zmbuf->mbuf = pkts[i]; + zmbuf->desc_idx = head_idx; + + /* + * Pin lock the mbuf; we will check later to see + * whether the mbuf is freed (when we are the last + * user) or not. If that's the case, we then could + * update the used ring safely. + */ + rte_mbuf_refcnt_update(pkts[i], 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + } + } + vq->last_avail_idx += i; + + if (likely(dev->dequeue_zero_copy == 0)) { + do_data_copy_dequeue(vq); + if (unlikely(i < count)) + vq->shadow_used_idx = i; + flush_shadow_used_ring_split(dev, vq); + vhost_vring_call_split(dev, vq); + } + + return i; +} + +static __rte_always_inline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + uint16_t i; + + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); if (unlikely(dev->dequeue_zero_copy)) { struct zcopy_mbuf *zmbuf, *next; - int nr_updated = 0; for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); zmbuf != NULL; zmbuf = next) { next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - used_idx = vq->last_used_idx++ & (vq->size - 1); - update_used_ring(dev, vq, used_idx, - zmbuf->desc_idx); - nr_updated += 1; + update_shadow_used_ring_packed(vq, + zmbuf->desc_idx, + 0, + zmbuf->desc_count); TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); restore_mbuf(zmbuf->mbuf); @@ -1187,93 +1473,34 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, } } - update_used_idx(dev, vq, nr_updated); + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); } - /* - * Construct a RARP broadcast packet, and inject it to the "pkts" - * array, to looks like that guest actually send such packet. - * - * Check user_send_rarp() for more information. - * - * broadcast_rarp shares a cacheline in the virtio_net structure - * with some fields that are accessed during enqueue and - * rte_atomic16_cmpset() causes a write if using cmpxchg. This could - * result in false sharing between enqueue and dequeue. - * - * Prevent unnecessary false sharing by reading broadcast_rarp first - * and only performing cmpset if the read indicates it is likely to - * be set. 
- */ - - if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) && - rte_atomic16_cmpset((volatile uint16_t *) - &dev->broadcast_rarp.cnt, 1, 0))) { - - rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac); - if (rarp_mbuf == NULL) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to make RARP packet.\n"); - return 0; - } - count -= 1; - } - - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - goto out; - - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - /* Prefetch available and used ring */ - avail_idx = vq->last_avail_idx & (vq->size - 1); - used_idx = vq->last_used_idx & (vq->size - 1); - rte_prefetch0(&vq->avail->ring[avail_idx]); - rte_prefetch0(&vq->used->ring[used_idx]); + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); count = RTE_MIN(count, MAX_PKT_BURST); - count = RTE_MIN(count, free_entries); - LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", dev->vid, count); - /* Retrieve all of the head indexes first to avoid caching issues. */ - for (i = 0; i < count; i++) { - avail_idx = (vq->last_avail_idx + i) & (vq->size - 1); - used_idx = (vq->last_used_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[avail_idx]; - - if (likely(dev->dequeue_zero_copy == 0)) - update_used_ring(dev, vq, used_idx, desc_indexes[i]); - } - - /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[desc_indexes[0]]); for (i = 0; i < count; i++) { - struct vring_desc *desc; - uint16_t sz, idx; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t buf_id, dummy_len; + uint16_t desc_count, nr_vec = 0; int err; - if (likely(i + 1 < count)) - rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &dummy_len, + VHOST_ACCESS_RW) < 0)) + break; - if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) { - desc = (struct vring_desc *)(uintptr_t) - vhost_iova_to_vva(dev, vq, - vq->desc[desc_indexes[i]].addr, - sizeof(*desc), - VHOST_ACCESS_RO); - if (unlikely(!desc)) - break; + if (likely(dev->dequeue_zero_copy == 0)) + update_shadow_used_ring_packed(vq, buf_id, 0, + desc_count); - rte_prefetch0(desc); - sz = vq->desc[desc_indexes[i]].len / sizeof(*desc); - idx = 0; - } else { - desc = vq->desc; - sz = vq->size; - idx = desc_indexes[i]; - } + rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { @@ -1282,8 +1509,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, break; } - err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx, - mbuf_pool); + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); if (unlikely(err)) { rte_pktmbuf_free(pkts[i]); break; @@ -1298,7 +1525,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, break; } zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = desc_indexes[i]; + zmbuf->desc_idx = buf_id; + zmbuf->desc_count = desc_count; /* * Pin lock the mbuf; we will check later to see @@ -1311,15 +1539,103 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, vq->nr_zmbuf += 1; TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } } - vq->last_avail_idx += i; if (likely(dev->dequeue_zero_copy == 0)) { do_data_copy_dequeue(vq); - vq->last_used_idx += i; - update_used_idx(dev, vq, i); + if 
(unlikely(i < count)) + vq->shadow_used_idx = i; + flush_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } + + return i; +} + +uint16_t +rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev; + struct rte_mbuf *rarp_mbuf = NULL; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + RTE_LOG(ERR, VHOST_DATA, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return 0; + } + + if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0)) + return 0; + + if (unlikely(vq->enabled == 0)) { + count = 0; + goto out_access_unlock; + } + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) { + count = 0; + goto out; + } + + /* + * Construct a RARP broadcast packet, and inject it to the "pkts" + * array, to looks like that guest actually send such packet. + * + * Check user_send_rarp() for more information. + * + * broadcast_rarp shares a cacheline in the virtio_net structure + * with some fields that are accessed during enqueue and + * rte_atomic16_cmpset() causes a write if using cmpxchg. This could + * result in false sharing between enqueue and dequeue. + * + * Prevent unnecessary false sharing by reading broadcast_rarp first + * and only performing cmpset if the read indicates it is likely to + * be set. + */ + if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) && + rte_atomic16_cmpset((volatile uint16_t *) + &dev->broadcast_rarp.cnt, 1, 0))) { + + rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac); + if (rarp_mbuf == NULL) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to make RARP packet.\n"); + count = 0; + goto out; + } + count -= 1; } + if (vq_is_packed(dev)) + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); + else + count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); + out: if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) vhost_user_iotlb_rd_unlock(vq); @@ -1332,10 +1648,10 @@ out_access_unlock: * Inject it to the head of "pkts" array, so that switch's mac * learning table will get updated first. */ - memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *)); + memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *)); pkts[0] = rarp_mbuf; - i += 1; + count += 1; } - return i; + return count; } |
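
The packed-ring paths added above never mask `last_avail_idx` with `size - 1` the way the split-ring code does; instead they wrap the index explicitly and flip `avail_wrap_counter`. Below is a minimal standalone sketch (not part of the patch) of that update step; the struct is a stripped-down stand-in for `struct vhost_virtqueue`, keeping only the fields the diff touches.

```c
#include <stdint.h>

struct vq_sketch {
	uint16_t last_avail_idx;     /* next descriptor slot to consume */
	uint16_t size;               /* ring size */
	uint8_t  avail_wrap_counter; /* flips each time the ring wraps */
};

/* Mirror of the "vq->last_avail_idx += desc_count; wrap and flip" logic
 * used by virtio_dev_rx_packed()/virtio_dev_tx_packed() in the hunks above. */
static void
advance_avail_packed(struct vq_sketch *vq, uint16_t desc_count)
{
	vq->last_avail_idx += desc_count;
	if (vq->last_avail_idx >= vq->size) {
		vq->last_avail_idx -= vq->size;
		vq->avail_wrap_counter ^= 1;
	}
}
```

The wrap counter is what lets the packed ring distinguish "descriptor made available this lap" from "descriptor left over from the previous lap", which is why it must be toggled exactly once per wrap.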
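
For context, both ring layouts stay hidden behind the same public entry points reworked in this patch, `rte_vhost_enqueue_burst()` and `rte_vhost_dequeue_burst()`; the split/packed dispatch happens internally via `vq_is_packed()`. A rough caller sketch follows, assuming the conventional single-queue layout where index 0 is the guest's receive ring and index 1 its transmit ring (matching the `is_valid_virt_queue_idx(queue_id, 0/1, ...)` checks above); function and type names are the real DPDK API, the helper itself is illustrative only.

```c
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_vhost.h>

#define BURST 32

/* Move one burst of packets between a vhost-user device and the
 * application, regardless of whether the guest negotiated a split
 * or a packed virtqueue. */
static void
forward_one_burst(int vid, struct rte_mempool *mp)
{
	struct rte_mbuf *pkts[BURST];
	uint16_t n, sent;

	/* Drain packets the guest transmitted (virtqueue 1). */
	n = rte_vhost_dequeue_burst(vid, 1, mp, pkts, BURST);

	/* ... application processing of pkts[0..n) would go here ... */

	/* Hand the packets back to the guest (virtqueue 0) and free
	 * whatever did not fit in the ring. */
	sent = rte_vhost_enqueue_burst(vid, 0, pkts, n);
	while (sent < n)
		rte_pktmbuf_free(pkts[sent++]);
}
```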