From 3d9b72106bd664b1267533e7278ff817f942e3c6 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt
Date: Thu, 8 Dec 2016 14:07:29 +0100
Subject: Imported Upstream version 16.11

Change-Id: I1944c65ddc88a9ad70f8c0eb6731552b84fbcb77
Signed-off-by: Christian Ehrhardt
---
 lib/librte_vhost/Makefile                     |   13 +-
 lib/librte_vhost/eventfd_link/Makefile        |   41 -
 lib/librte_vhost/eventfd_link/eventfd_link.c  |  277 ------
 lib/librte_vhost/eventfd_link/eventfd_link.h  |   94 --
 lib/librte_vhost/fd_man.c                     |  299 +++++++
 lib/librte_vhost/fd_man.h                     |   67 ++
 lib/librte_vhost/libvirt/qemu-wrap.py         |  387 --------
 lib/librte_vhost/rte_virtio_net.h             |   10 +-
 lib/librte_vhost/socket.c                     |  615 +++++++++++++
 lib/librte_vhost/vhost-net.h                  |  250 ------
 lib/librte_vhost/vhost.c                      |  430 +++++++++
 lib/librte_vhost/vhost.h                      |  293 ++++++
 lib/librte_vhost/vhost_cuse/eventfd_copy.c    |  104 ---
 lib/librte_vhost/vhost_cuse/eventfd_copy.h    |   45 -
 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c  |  431 ---------
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |  433 ---------
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.h |   56 --
 lib/librte_vhost/vhost_rxtx.c                 |  931 -------------------
 lib/librte_vhost/vhost_user.c                 | 1033 +++++++++++++++++++++
 lib/librte_vhost/vhost_user.h                 |  128 +++
 lib/librte_vhost/vhost_user/fd_man.c          |  299 -------
 lib/librte_vhost/vhost_user/fd_man.h          |   67 --
 lib/librte_vhost/vhost_user/vhost-net-user.c  |  795 -----------------
 lib/librte_vhost/vhost_user/vhost-net-user.h  |  113 ---
 lib/librte_vhost/vhost_user/virtio-net-user.c |  470 ----------
 lib/librte_vhost/vhost_user/virtio-net-user.h |   62 --
 lib/librte_vhost/virtio-net.c                 |  847 ------------------
 lib/librte_vhost/virtio_net.c                 | 1182 +++++++++++++++++++++
 28 files changed, 4052 insertions(+), 5720 deletions(-)
 delete mode 100644 lib/librte_vhost/eventfd_link/Makefile
 delete mode 100644 lib/librte_vhost/eventfd_link/eventfd_link.c
 delete mode 100644 lib/librte_vhost/eventfd_link/eventfd_link.h
 create mode 100644 lib/librte_vhost/fd_man.c
 create mode 100644 lib/librte_vhost/fd_man.h
 delete mode 100755 lib/librte_vhost/libvirt/qemu-wrap.py
 create mode 100644 lib/librte_vhost/socket.c
 delete mode 100644 lib/librte_vhost/vhost-net.h
 create mode 100644 lib/librte_vhost/vhost.c
 create mode 100644 lib/librte_vhost/vhost.h
 delete mode 100644 lib/librte_vhost/vhost_cuse/eventfd_copy.c
 delete mode 100644 lib/librte_vhost/vhost_cuse/eventfd_copy.h
 delete mode 100644 lib/librte_vhost/vhost_cuse/vhost-net-cdev.c
 delete mode 100644 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
 delete mode 100644 lib/librte_vhost/vhost_cuse/virtio-net-cdev.h
 delete mode 100644 lib/librte_vhost/vhost_rxtx.c
 create mode 100644 lib/librte_vhost/vhost_user.c
 create mode 100644 lib/librte_vhost/vhost_user.h
 delete mode 100644 lib/librte_vhost/vhost_user/fd_man.c
 delete mode 100644 lib/librte_vhost/vhost_user/fd_man.h
 delete mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
 delete mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
 delete mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
 delete mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
 delete mode 100644 lib/librte_vhost/virtio-net.c
 create mode 100644 lib/librte_vhost/virtio_net.c

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 538adb0b..415ffc6e 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -39,25 +39,16 @@ EXPORT_MAP := rte_vhost_version.map

 LIBABIVER := 3

 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
-ifeq
($(CONFIG_RTE_LIBRTE_VHOST_USER),y) CFLAGS += -I vhost_user LDLIBS += -lpthread -else -CFLAGS += -I vhost_cuse -LDLIBS += -lfuse -endif ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif # all source are stored in SRCS-y -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c -ifeq ($(CONFIG_RTE_LIBRTE_VHOST_USER),y) -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c -else -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c -endif +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \ + virtio_net.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h diff --git a/lib/librte_vhost/eventfd_link/Makefile b/lib/librte_vhost/eventfd_link/Makefile deleted file mode 100644 index 3140e8bf..00000000 --- a/lib/librte_vhost/eventfd_link/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# BSD LICENSE -# -# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -RTE_KERNELDIR ?= /lib/modules/$(shell uname -r)/build - -obj-m += eventfd_link.o - - -all: - make -C $(RTE_KERNELDIR) M=$(PWD) modules - -clean: - make -C $(RTE_KERNELDIR) M=$(PWD) clean diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.c b/lib/librte_vhost/eventfd_link/eventfd_link.c deleted file mode 100644 index 4b05b5a8..00000000 --- a/lib/librte_vhost/eventfd_link/eventfd_link.c +++ /dev/null @@ -1,277 +0,0 @@ -/*- - * GPL LICENSE SUMMARY - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * The full GNU General Public License is included in this distribution - * in the file called LICENSE.GPL. - * - * Contact Information: - * Intel Corporation - */ - -#include -#include -#include -#include -#include - -#include "eventfd_link.h" - - -/* - * get_files_struct is copied from fs/file.c - */ -struct files_struct * -get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -/* - * put_files_struct is extracted from fs/file.c - */ -void -put_files_struct(struct files_struct *files) -{ - if (atomic_dec_and_test(&files->count)) - BUG(); -} - -static struct file * -fget_from_files(struct files_struct *files, unsigned fd) -{ - struct file *file; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) { - - file = NULL; - } - } - rcu_read_unlock(); - - return file; -} - -static long -eventfd_link_ioctl_copy2(unsigned long arg) -{ - void __user *argp = (void __user *) arg; - struct task_struct *task_target = NULL; - struct file *file; - struct files_struct *files; - struct eventfd_copy2 eventfd_copy2; - long ret = -EFAULT; - - if (copy_from_user(&eventfd_copy2, argp, sizeof(struct eventfd_copy2))) - goto out; - - /* - * Find the task struct for the target pid - */ - ret = -ESRCH; - - task_target = - get_pid_task(find_vpid(eventfd_copy2.pid), PIDTYPE_PID); - if (task_target == NULL) { - pr_info("Unable to find pid %d\n", eventfd_copy2.pid); - goto out; - } - - ret = -ESTALE; - files = get_files_struct(task_target); - if (files == NULL) { - pr_info("Failed to get target files struct\n"); - goto out_task; - } - - ret = -EBADF; - file = fget_from_files(files, eventfd_copy2.fd); - put_files_struct(files); - - if (file == NULL) { - pr_info("Failed to get fd %d from target\n", eventfd_copy2.fd); - goto out_task; - } - - /* - * Install the file struct from the target process into the - * newly allocated file desciptor of the source process. 
- */ - ret = get_unused_fd_flags(eventfd_copy2.flags); - if (ret < 0) { - fput(file); - goto out_task; - } - fd_install(ret, file); - -out_task: - put_task_struct(task_target); -out: - return ret; -} - -static long -eventfd_link_ioctl_copy(unsigned long arg) -{ - void __user *argp = (void __user *) arg; - struct task_struct *task_target = NULL; - struct file *file; - struct files_struct *files; - struct fdtable *fdt; - struct eventfd_copy eventfd_copy; - long ret = -EFAULT; - - if (copy_from_user(&eventfd_copy, argp, sizeof(struct eventfd_copy))) - goto out; - - /* - * Find the task struct for the target pid - */ - ret = -ESRCH; - - task_target = - get_pid_task(find_vpid(eventfd_copy.target_pid), PIDTYPE_PID); - if (task_target == NULL) { - pr_info("Unable to find pid %d\n", eventfd_copy.target_pid); - goto out; - } - - ret = -ESTALE; - files = get_files_struct(current); - if (files == NULL) { - pr_info("Failed to get current files struct\n"); - goto out_task; - } - - ret = -EBADF; - file = fget_from_files(files, eventfd_copy.source_fd); - - if (file == NULL) { - pr_info("Failed to get fd %d from source\n", - eventfd_copy.source_fd); - put_files_struct(files); - goto out_task; - } - - /* - * Release the existing eventfd in the source process - */ - spin_lock(&files->file_lock); - fput(file); - filp_close(file, files); - fdt = files_fdtable(files); - fdt->fd[eventfd_copy.source_fd] = NULL; - spin_unlock(&files->file_lock); - - put_files_struct(files); - - /* - * Find the file struct associated with the target fd. - */ - - ret = -ESTALE; - files = get_files_struct(task_target); - if (files == NULL) { - pr_info("Failed to get target files struct\n"); - goto out_task; - } - - ret = -EBADF; - file = fget_from_files(files, eventfd_copy.target_fd); - put_files_struct(files); - - if (file == NULL) { - pr_info("Failed to get fd %d from target\n", - eventfd_copy.target_fd); - goto out_task; - } - - /* - * Install the file struct from the target process into the - * file desciptor of the source process, - */ - - fd_install(eventfd_copy.source_fd, file); - ret = 0; - -out_task: - put_task_struct(task_target); -out: - return ret; -} - -static long -eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) -{ - long ret = -ENOIOCTLCMD; - - switch (ioctl) { - case EVENTFD_COPY: - ret = eventfd_link_ioctl_copy(arg); - break; - case EVENTFD_COPY2: - ret = eventfd_link_ioctl_copy2(arg); - break; - } - - return ret; -} - -static const struct file_operations eventfd_link_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = eventfd_link_ioctl, -}; - - -static struct miscdevice eventfd_link_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "eventfd-link", - .fops = &eventfd_link_fops, -}; - -static int __init -eventfd_link_init(void) -{ - return misc_register(&eventfd_link_misc); -} - -module_init(eventfd_link_init); - -static void __exit -eventfd_link_exit(void) -{ - misc_deregister(&eventfd_link_misc); -} - -module_exit(eventfd_link_exit); - -MODULE_VERSION("0.0.1"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Anthony Fee"); -MODULE_DESCRIPTION("Link eventfd"); -MODULE_ALIAS("devname:eventfd-link"); diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.h b/lib/librte_vhost/eventfd_link/eventfd_link.h deleted file mode 100644 index 5ebc20b8..00000000 --- a/lib/librte_vhost/eventfd_link/eventfd_link.h +++ /dev/null @@ -1,94 +0,0 @@ -/*- - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. 
- * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * The full GNU General Public License is included in this distribution - * in the file called LICENSE.GPL. - * - * Contact Information: - * Intel Corporation - * - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#ifndef _EVENTFD_LINK_H_ -#define _EVENTFD_LINK_H_ - -/* - * arguements for the EVENTFD_COPY ioctl - */ -struct eventfd_copy { - unsigned target_fd; /* fd in the target pid */ - unsigned source_fd; /* fd in the calling pid */ - pid_t target_pid; /* pid of the target pid */ -}; - -/* - * ioctl to copy an fd entry in calling process to an fd in a target process - * NOTE: this one should be - * #define EVENTFD_COPY _IOWR('D', 1, struct eventfd_copy) actually - */ -#define EVENTFD_COPY 1 - -/* - * arguments for the EVENTFD_COPY2 ioctl - */ -struct eventfd_copy2 { - unsigned fd; /* fd to steal */ - pid_t pid; /* pid of the process to steal from */ - unsigned flags; /* flags to allocate new fd with */ -}; - -/* - * ioctl to copy an fd entry from the target process into newly allocated - * fd in the calling process - */ -#define EVENTFD_COPY2 _IOW('D', 2, struct eventfd_copy2) - -#endif /* _EVENTFD_LINK_H_ */ diff --git a/lib/librte_vhost/fd_man.c b/lib/librte_vhost/fd_man.c new file mode 100644 index 00000000..2d3eeb7d --- /dev/null +++ b/lib/librte_vhost/fd_man.c @@ -0,0 +1,299 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fd_man.h" + +/** + * Returns the index in the fdset for a given fd. + * If fd is -1, it means to search for a free entry. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + if (pfdset == NULL) + return -1; + + for (i = 0; i < MAX_FDS && pfdset->fd[i].fd != fd; i++) + ; + + return i == MAX_FDS ? 
-1 : i; +} + +static int +fdset_find_free_slot(struct fdset *pfdset) +{ + return fdset_find_fd(pfdset, -1); +} + +static int +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry; + + if (pfdset == NULL || idx >= MAX_FDS || fd >= FD_SETSIZE) + return -1; + + pfdentry = &pfdset->fd[idx]; + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + return 0; +} + +/** + * Fill the read/write fd_set with the fds in the fdset. + * @return + * the maximum fds filled in the read/write fd_set. + */ +static int +fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) +{ + struct fdentry *pfdentry; + int i, maxfds = -1; + int num = MAX_FDS; + + if (pfdset == NULL) + return -1; + + for (i = 0; i < num; i++) { + pfdentry = &pfdset->fd[i]; + if (pfdentry->fd != -1) { + int added = 0; + if (pfdentry->rcb && rfset) { + FD_SET(pfdentry->fd, rfset); + added = 1; + } + if (pfdentry->wcb && wfset) { + FD_SET(pfdentry->fd, wfset); + added = 1; + } + if (added) + maxfds = pfdentry->fd < maxfds ? + maxfds : pfdentry->fd; + } + } + return maxfds; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + + /* Find a free slot in the list. */ + i = fdset_find_free_slot(pfdset); + if (i == -1 || fdset_add_fd(pfdset, i, fd, rcb, wcb, dat) < 0) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + + pfdset->num++; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + pfdset->num--; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + +/** + * Unregister the fd at the specified slot from the fdset. + */ +static void +fdset_del_slot(struct fdset *pfdset, int index) +{ + if (pfdset == NULL || index < 0 || index >= MAX_FDS) + return; + + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdset->fd[index].fd = -1; + pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL; + pfdset->fd[index].dat = NULL; + pfdset->num--; + + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. + * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. 
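+ *
+ * A minimal usage sketch (illustrative only; the callback, fdset and
+ * context names below are hypothetical, not part of this file):
+ *
+ *	static void my_read_cb(int fd, void *dat, int *remove);
+ *
+ *	fdset_init(&my_fdset);
+ *	fdset_add(&my_fdset, fd, my_read_cb, NULL, ctx);
+ *	fdset_event_dispatch(&my_fdset);  <- loops forever over select()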
+ */ +void +fdset_event_dispatch(struct fdset *pfdset) +{ + fd_set rfds, wfds; + int i, maxfds; + struct fdentry *pfdentry; + int num = MAX_FDS; + fd_cb rcb, wcb; + void *dat; + int fd; + int remove1, remove2; + int ret; + + if (pfdset == NULL) + return; + + while (1) { + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + pthread_mutex_lock(&pfdset->fd_mutex); + + maxfds = fdset_fill(&rfds, &wfds, pfdset); + + pthread_mutex_unlock(&pfdset->fd_mutex); + + /* + * When select is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When select returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv); + if (ret <= 0) + continue; + + for (i = 0; i < num; i++) { + remove1 = remove2 = 0; + pthread_mutex_lock(&pfdset->fd_mutex); + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb) + rcb(fd, dat, &remove1); + if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) + fdset_del_slot(pfdset, i); + } + } +} diff --git a/lib/librte_vhost/fd_man.h b/lib/librte_vhost/fd_man.h new file mode 100644 index 00000000..bd66ed1c --- /dev/null +++ b/lib/librte_vhost/fd_man.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
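+ *
+ * fd_man multiplexes callbacks over a set of file descriptors: each
+ * fdentry pairs an fd with optional read/write callbacks and a context
+ * pointer, and fdset_event_dispatch() polls the whole set via select().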
+ */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include +#include + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable.*/ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void fdset_event_dispatch(struct fdset *pfdset); + +#endif diff --git a/lib/librte_vhost/libvirt/qemu-wrap.py b/lib/librte_vhost/libvirt/qemu-wrap.py deleted file mode 100755 index e6a2cc9d..00000000 --- a/lib/librte_vhost/libvirt/qemu-wrap.py +++ /dev/null @@ -1,387 +0,0 @@ -#!/usr/bin/python -#/* -# * BSD LICENSE -# * -# * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. -# * All rights reserved. -# * -# * Redistribution and use in source and binary forms, with or without -# * modification, are permitted provided that the following conditions -# * are met: -# * -# * * Redistributions of source code must retain the above copyright -# * notice, this list of conditions and the following disclaimer. -# * * Redistributions in binary form must reproduce the above copyright -# * notice, this list of conditions and the following disclaimer in -# * the documentation and/or other materials provided with the -# * distribution. -# * * Neither the name of Intel Corporation nor the names of its -# * contributors may be used to endorse or promote products derived -# * from this software without specific prior written permission. -# * -# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# */ - -##################################################################### -# This script is designed to modify the call to the QEMU emulator -# to support userspace vhost when starting a guest machine through -# libvirt with vhost enabled. The steps to enable this are as follows -# and should be run as root: -# -# 1. Place this script in a libvirtd's binary search PATH ($PATH) -# A good location would be in the same directory that the QEMU -# binary is located -# -# 2. Ensure that the script has the same owner/group and file -# permissions as the QEMU binary -# -# 3. 
Update the VM xml file using "virsh edit VM.xml" -# -# 3.a) Set the VM to use the launch script -# -# Set the emulator path contained in the -# tags -# -# e.g replace /usr/bin/qemu-kvm -# with /usr/bin/qemu-wrap.py -# -# 3.b) Set the VM's device's to use vhost-net offload -# -# -# -# -# -# -# 4. Enable libvirt to access our userpace device file by adding it to -# controllers cgroup for libvirtd using the following steps -# -# 4.a) In /etc/libvirt/qemu.conf add/edit the following lines: -# 1) cgroup_controllers = [ ... "devices", ... ] -# 2) clear_emulator_capabilities = 0 -# 3) user = "root" -# 4) group = "root" -# 5) cgroup_device_acl = [ -# "/dev/null", "/dev/full", "/dev/zero", -# "/dev/random", "/dev/urandom", -# "/dev/ptmx", "/dev/kvm", "/dev/kqemu", -# "/dev/rtc", "/dev/hpet", "/dev/net/tun", -# "/dev/", -# "/dev/hugepages", -# ] -# -# 4.b) Disable SELinux or set to permissive mode -# -# 4.c) Mount cgroup device controller -# "mkdir /dev/cgroup" -# "mount -t cgroup none /dev/cgroup -o devices" -# -# 4.d) Set hugetlbfs_mount variable - ( Optional ) -# VMs using userspace vhost must use hugepage backed -# memory. This can be enabled in the libvirt XML -# config by adding a memory backing section to the -# XML config e.g. -# -# -# -# This memory backing section should be added after the -# and sections. This will add -# flags "-mem-prealloc -mem-path " to the QEMU -# command line. The hugetlbfs_mount variable can be used -# to override the default passed through by libvirt. -# -# if "-mem-prealloc" or "-mem-path " are not passed -# through and a vhost device is detected then these options will -# be automatically added by this script. This script will detect -# the system hugetlbfs mount point to be used for . The -# default for this script can be overidden by the -# hugetlbfs_dir variable in the configuration section of this script. -# -# -# 4.e) Restart the libvirtd system process -# e.g. on Fedora "systemctl restart libvirtd.service" -# -# -# 4.f) Edit the Configuration Parameters section of this script -# to point to the correct emulator location and set any -# addition options -# -# The script modifies the libvirtd Qemu call by modifying/adding -# options based on the configuration parameters below. -# NOTE: -# emul_path and us_vhost_path must be set -# All other parameters are optional -##################################################################### - - -############################################# -# Configuration Parameters -############################################# -#Path to QEMU binary -emul_path = "/usr/local/bin/qemu-system-x86_64" - -#Path to userspace vhost device file -# This filename should match the --dev-basename parameters of -# the command used to launch the userspace vhost sample application e.g. -# if the sample app lauch command is: -# ./build/vhost-switch ..... --dev-basename usvhost -# then this variable should be set to: -# us_vhost_path = "/dev/usvhost" -us_vhost_path = "/dev/usvhost" - -#List of additional user defined emulation options. These options will -#be added to all Qemu calls -emul_opts_user = [] - -#List of additional user defined emulation options for vhost only. 
-#These options will only be added to vhost enabled guests -emul_opts_user_vhost = [] - -#For all VHOST enabled VMs, the VM memory is preallocated from hugetlbfs -# Set this variable to one to enable this option for all VMs -use_huge_all = 0 - -#Instead of autodetecting, override the hugetlbfs directory by setting -#this variable -hugetlbfs_dir = "" - -############################################# - - -############################################# -# ****** Do Not Modify Below this Line ****** -############################################# - -import sys, os, subprocess -import time -import signal - - -#List of open userspace vhost file descriptors -fd_list = [] - -#additional virtio device flags when using userspace vhost -vhost_flags = [ "csum=off", - "gso=off", - "guest_tso4=off", - "guest_tso6=off", - "guest_ecn=off" - ] - -#String of the path to the Qemu process pid -qemu_pid = "/tmp/%d-qemu.pid" % os.getpid() - -############################################# -# Signal haldler to kill Qemu subprocess -############################################# -def kill_qemu_process(signum, stack): - pidfile = open(qemu_pid, 'r') - pid = int(pidfile.read()) - os.killpg(pid, signal.SIGTERM) - pidfile.close() - - -############################################# -# Find the system hugefile mount point. -# Note: -# if multiple hugetlbfs mount points exist -# then the first one found will be used -############################################# -def find_huge_mount(): - - if (len(hugetlbfs_dir)): - return hugetlbfs_dir - - huge_mount = "" - - if (os.access("/proc/mounts", os.F_OK)): - f = open("/proc/mounts", "r") - line = f.readline() - while line: - line_split = line.split(" ") - if line_split[2] == 'hugetlbfs': - huge_mount = line_split[1] - break - line = f.readline() - else: - print "/proc/mounts not found" - exit (1) - - f.close - if len(huge_mount) == 0: - print "Failed to find hugetlbfs mount point" - exit (1) - - return huge_mount - - -############################################# -# Get a userspace Vhost file descriptor -############################################# -def get_vhost_fd(): - - if (os.access(us_vhost_path, os.F_OK)): - fd = os.open( us_vhost_path, os.O_RDWR) - else: - print ("US-Vhost file %s not found" %us_vhost_path) - exit (1) - - return fd - - -############################################# -# Check for vhostfd. 
if found then replace -# with our own vhost fd and append any vhost -# flags onto the end -############################################# -def modify_netdev_arg(arg): - - global fd_list - vhost_in_use = 0 - s = '' - new_opts = [] - netdev_opts = arg.split(",") - - for opt in netdev_opts: - #check if vhost is used - if "vhost" == opt[:5]: - vhost_in_use = 1 - else: - new_opts.append(opt) - - #if using vhost append vhost options - if vhost_in_use == 1: - #append vhost on option - new_opts.append('vhost=on') - #append vhostfd ption - new_fd = get_vhost_fd() - new_opts.append('vhostfd=' + str(new_fd)) - fd_list.append(new_fd) - - #concatenate all options - for opt in new_opts: - if len(s) > 0: - s+=',' - - s+=opt - - return s - - -############################################# -# Main -############################################# -def main(): - - global fd_list - global vhost_in_use - new_args = [] - num_cmd_args = len(sys.argv) - emul_call = '' - mem_prealloc_set = 0 - mem_path_set = 0 - num = 0; - - #parse the parameters - while (num < num_cmd_args): - arg = sys.argv[num] - - #Check netdev +1 parameter for vhostfd - if arg == '-netdev': - num_vhost_devs = len(fd_list) - new_args.append(arg) - - num+=1 - arg = sys.argv[num] - mod_arg = modify_netdev_arg(arg) - new_args.append(mod_arg) - - #append vhost flags if this is a vhost device - # and -device is the next arg - # i.e -device -opt1,-opt2,...,-opt3,%vhost - if (num_vhost_devs < len(fd_list)): - num+=1 - arg = sys.argv[num] - if arg == '-device': - new_args.append(arg) - num+=1 - new_arg = sys.argv[num] - for flag in vhost_flags: - new_arg = ''.join([new_arg,',',flag]) - new_args.append(new_arg) - else: - new_args.append(arg) - elif arg == '-mem-prealloc': - mem_prealloc_set = 1 - new_args.append(arg) - elif arg == '-mem-path': - mem_path_set = 1 - new_args.append(arg) - - else: - new_args.append(arg) - - num+=1 - - #Set Qemu binary location - emul_call+=emul_path - emul_call+=" " - - #Add prealloc mem options if using vhost and not already added - if ((len(fd_list) > 0) and (mem_prealloc_set == 0)): - emul_call += "-mem-prealloc " - - #Add mempath mem options if using vhost and not already added - if ((len(fd_list) > 0) and (mem_path_set == 0)): - #Detect and add hugetlbfs mount point - mp = find_huge_mount() - mp = "".join(["-mem-path ", mp]) - emul_call += mp - emul_call += " " - - #add user options - for opt in emul_opts_user: - emul_call += opt - emul_call += " " - - #Add add user vhost only options - if len(fd_list) > 0: - for opt in emul_opts_user_vhost: - emul_call += opt - emul_call += " " - - #Add updated libvirt options - iter_args = iter(new_args) - #skip 1st arg i.e. 
call to this script - next(iter_args) - for arg in iter_args: - emul_call+=str(arg) - emul_call+= " " - - emul_call += "-pidfile %s " % qemu_pid - #Call QEMU - process = subprocess.Popen(emul_call, shell=True, preexec_fn=os.setsid) - - for sig in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT]: - signal.signal(sig, kill_qemu_process) - - process.wait() - - #Close usvhost files - for fd in fd_list: - os.close(fd) - #Cleanup temporary files - if os.access(qemu_pid, os.F_OK): - os.remove(qemu_pid) - -if __name__ == "__main__": - main() diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 9caa6221..926039c5 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -53,16 +53,13 @@ #define RTE_VHOST_USER_CLIENT (1ULL << 0) #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) /* Enum for virtqueue management. */ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; /** * Device and vring operations. - * - * Make sure to set VIRTIO_DEV_RUNNING to the device flags in new_device and - * remove it in destroy_device. - * */ struct virtio_net_device_ops { int (*new_device)(int vid); /**< Add device. */ @@ -126,9 +123,8 @@ int rte_vhost_get_numa_node(int vid); uint32_t rte_vhost_get_queue_num(int vid); /** - * Get the virtio net device's ifname. For vhost-cuse, ifname is the - * path of the char device. For vhost-user, ifname is the vhost-user - * socket file path. + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. * * @param vid * virtio-net device ID diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c new file mode 100644 index 00000000..aaa9c270 --- /dev/null +++ b/lib/librte_vhost/socket.c @@ -0,0 +1,615 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
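+ *
+ * socket.c implements the vhost-user transport over UNIX sockets: a
+ * registered path acts as a server (the default) or, with
+ * RTE_VHOST_USER_CLIENT, as a client that may reconnect automatically;
+ * file descriptors travel over the socket as SCM_RIGHTS ancillary data.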
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + char *path; + int listenfd; + int connfd; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int vid; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int vhost_user_create_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. */ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(); + if (vid == -1) { + close(fd); + free(conn); + return; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, 
vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + vsocket->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + vsocket->connfd = -1; + free(conn); + close(fd); + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + } +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + vsocket->connfd = -1; + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + free(conn); + + if (vsocket->reconnect) + vhost_user_create_client(vsocket); + } +} + +static int +create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) +{ + int fd; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + is_server ? "server" : "client", fd); + + if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + return fd; +} + +static int +vhost_user_create_server(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) + return -1; + + ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + vsocket->listenfd = fd; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd 
%d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. + */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_create_client(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) + return -1; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un, + sizeof(un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(ERR, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. 
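+ *
+ * Usage sketch (the socket path below is illustrative):
+ *
+ *	rte_vhost_driver_register("/tmp/sock0", 0);
+ *	rte_vhost_driver_register("/tmp/sock0", RTE_VHOST_USER_CLIENT);
+ *
+ * Returns 0 on success and -1 on failure, e.g. once MAX_VHOST_SOCKET
+ * paths have been registered.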
+ */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + vsocket->connfd = -1; + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + ret = vhost_user_create_client(vsocket); + } else { + vsocket->is_server = true; + ret = vhost_user_create_server(vsocket); + } + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->listenfd); + close(vsocket->listenfd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + conn = fdset_del(&vhost_user.fdset, vsocket->connfd); + if (conn) { + RTE_LOG(INFO, VHOST_CONFIG, + "free connfd = %d for device '%s'\n", + vsocket->connfd, path); + close(vsocket->connfd); + vhost_destroy_device(conn->vid); + free(conn); + } + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +int +rte_vhost_driver_session_start(void) +{ + fdset_event_dispatch(&vhost_user.fdset); + return 0; +} diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h deleted file mode 100644 index 38593a29..00000000 --- a/lib/librte_vhost/vhost-net.h +++ /dev/null @@ -1,250 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _VHOST_NET_CDEV_H_ -#define _VHOST_NET_CDEV_H_ -#include -#include -#include -#include -#include - -#include - -#include "rte_virtio_net.h" - -/* Used to indicate that the device is running on a data core */ -#define VIRTIO_DEV_RUNNING 1 - -/* Backend value set by guest. */ -#define VIRTIO_DEV_STOPPED -1 - -#define BUF_VECTOR_MAX 256 - -/** - * Structure contains buffer address, length and descriptor index - * from vring to do scatter RX. - */ -struct buf_vector { - uint64_t buf_addr; - uint32_t buf_len; - uint32_t desc_idx; -}; - -/** - * Structure contains variables relevant to RX/TX virtqueues. - */ -struct vhost_virtqueue { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; - uint32_t size; - - /* Last index used on the available ring */ - volatile uint16_t last_used_idx; -#define VIRTIO_INVALID_EVENTFD (-1) -#define VIRTIO_UNINITIALIZED_EVENTFD (-2) - - /* Backend value to determine if device should started/stopped */ - int backend; - /* Used to notify the guest (trigger interrupt) */ - int callfd; - /* Currently unused as polling mode is enabled */ - int kickfd; - int enabled; - - /* Physical address of used ring, for logging */ - uint64_t log_guest_addr; -} __rte_cache_aligned; - -/* Old kernels have no such macro defined */ -#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE - #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 -#endif - - -/* - * Make an extra wrapper for VIRTIO_NET_F_MQ and - * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are - * introduced since kernel v3.8. This makes our - * code buildable for older kernel. - */ -#ifdef VIRTIO_NET_F_MQ - #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX - #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ) -#else - #define VHOST_MAX_QUEUE_PAIRS 1 - #define VHOST_SUPPORTS_MQ 0 -#endif - -/* - * Define virtio 1.0 for older kernels - */ -#ifndef VIRTIO_F_VERSION_1 - #define VIRTIO_F_VERSION_1 32 -#endif - -/** - * Device structure contains all configuration information relating - * to the device. 
- */ -struct virtio_net { - /* Frontend (QEMU) memory and memory region information */ - struct virtio_memory *mem; - uint64_t features; - uint64_t protocol_features; - int vid; - uint32_t flags; - uint16_t vhost_hlen; - /* to tell if we need broadcast rarp packet */ - rte_atomic16_t broadcast_rarp; - uint32_t virt_qp_nb; - struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; -#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) - char ifname[IF_NAME_SZ]; - uint64_t log_size; - uint64_t log_base; - uint64_t log_addr; - struct ether_addr mac; - -} __rte_cache_aligned; - -/** - * Information relating to memory regions including offsets to - * addresses in QEMUs memory file. - */ -struct virtio_memory_regions { - uint64_t guest_phys_address; - uint64_t guest_phys_address_end; - uint64_t memory_size; - uint64_t userspace_address; - uint64_t address_offset; -}; - - -/** - * Memory structure includes region and mapping information. - */ -struct virtio_memory { - /* Base QEMU userspace address of the memory file. */ - uint64_t base_address; - uint64_t mapped_address; - uint64_t mapped_size; - uint32_t nregions; - struct virtio_memory_regions regions[0]; -}; - - -/* Macros for printing using RTE_LOG */ -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 - -#ifdef RTE_LIBRTE_VHOST_DEBUG -#define VHOST_MAX_PRINT_BUFF 6072 -#define LOG_LEVEL RTE_LOG_DEBUG -#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) -#define PRINT_PACKET(device, addr, size, header) do { \ - char *pkt_addr = (char *)(addr); \ - unsigned int index; \ - char packet[VHOST_MAX_PRINT_BUFF]; \ - \ - if ((header)) \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ - else \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ - for (index = 0; index < (size); index++) { \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ - "%02hhx ", pkt_addr[index]); \ - } \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ - \ - LOG_DEBUG(VHOST_DATA, "%s", packet); \ -} while (0) -#else -#define LOG_LEVEL RTE_LOG_INFO -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) -#define PRINT_PACKET(device, addr, size, header) do {} while (0) -#endif - -/** - * Function to convert guest physical addresses to vhost virtual addresses. - * This is used to convert guest virtio buffer addresses. 
- */ -static inline uint64_t __attribute__((always_inline)) -gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) -{ - struct virtio_memory_regions *region; - uint32_t regionidx; - uint64_t vhost_va = 0; - - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { - region = &dev->mem->regions[regionidx]; - if ((guest_pa >= region->guest_phys_address) && - (guest_pa <= region->guest_phys_address_end)) { - vhost_va = region->address_offset + guest_pa; - break; - } - } - return vhost_va; -} - -struct virtio_net_device_ops const *notify_ops; -struct virtio_net *get_device(int vid); - -int vhost_new_device(void); -void vhost_destroy_device(int); - -void vhost_set_ifname(int, const char *if_name, unsigned int if_len); - -int vhost_get_features(int, uint64_t *); -int vhost_set_features(int, uint64_t *); - -int vhost_set_vring_num(int, struct vhost_vring_state *); -int vhost_set_vring_addr(int, struct vhost_vring_addr *); -int vhost_set_vring_base(int, struct vhost_vring_state *); -int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *); - -int vhost_set_vring_kick(int, struct vhost_vring_file *); -int vhost_set_vring_call(int, struct vhost_vring_file *); - -int vhost_set_backend(int, struct vhost_vring_file *); - -int vhost_set_owner(int); -int vhost_reset_owner(int); - -/* - * Backend-specific cleanup. Defined by vhost-cuse and vhost-user. - */ -void vhost_backend_cleanup(struct virtio_net *dev); - -#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c new file mode 100644 index 00000000..31825b82 --- /dev/null +++ b/lib/librte_vhost/vhost.c @@ -0,0 +1,430 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_virtio_net.h>
+
+#include "vhost.h"
+
+#define VHOST_USER_F_PROTOCOL_FEATURES	30
+
+/* Features supported by this lib. */
+#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+				(1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+				(1ULL << VIRTIO_NET_F_CTRL_RX) | \
+				(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+				(VHOST_SUPPORTS_MQ)            | \
+				(1ULL << VIRTIO_F_VERSION_1)   | \
+				(1ULL << VHOST_F_LOG_ALL)      | \
+				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+				(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+				(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+				(1ULL << VIRTIO_NET_F_CSUM)    | \
+				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
+
+uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* device ops to add/remove device to/from data core. */
+struct virtio_net_device_ops const *notify_ops;
+
+struct virtio_net *
+get_device(int vid)
+{
+	struct virtio_net *dev = vhost_devices[vid];
+
+	if (unlikely(!dev)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%d) device not found.\n", vid);
+	}
+
+	return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+	if ((vq->callfd >= 0) && (destroy != 0))
+		close(vq->callfd);
+	if (vq->kickfd >= 0)
+		close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+	uint32_t i;
+
+	vhost_backend_cleanup(dev);
+
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
+		cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
+	}
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct vhost_virtqueue *rxq, *txq;
+
+	for (i = 0; i < dev->virt_qp_nb; i++) {
+		rxq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
+		txq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
+
+		rte_free(rxq->shadow_used_ring);
+		rte_free(txq->shadow_used_ring);
+
+		/* rxq and txq are allocated together as queue-pair */
+		rte_free(rxq);
+	}
+
+	rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+	memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+	/* Backends are set to -1 indicating an inactive device.
*/
+	vq->backend = -1;
+
+	/* always set the default vq pair to enabled */
+	if (qp_idx == 0)
+		vq->enabled = 1;
+
+	TAILQ_INIT(&vq->zmbuf_list);
+}
+
+static void
+init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+	uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+	init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+	init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+	int callfd;
+
+	callfd = vq->callfd;
+	init_vring_queue(vq, qp_idx);
+	vq->callfd = callfd;
+}
+
+static void
+reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+	uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+	reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+	reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+int
+alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+	struct vhost_virtqueue *virtqueue = NULL;
+	uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
+	uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
+
+	virtqueue = rte_malloc(NULL,
+			sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
+	if (virtqueue == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to allocate memory for virt qp:%d.\n", qp_idx);
+		return -1;
+	}
+
+	dev->virtqueue[virt_rx_q_idx] = virtqueue;
+	dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
+
+	init_vring_queue_pair(dev, qp_idx);
+
+	dev->virt_qp_nb += 1;
+
+	return 0;
+}
+
+/*
+ * Reset some variables in the device structure, while keeping a few
+ * others untouched, such as vid, ifname and virt_qp_nb: they
+ * should stay the same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+	uint32_t i;
+
+	dev->features = 0;
+	dev->protocol_features = 0;
+	dev->flags = 0;
+
+	for (i = 0; i < dev->virt_qp_nb; i++)
+		reset_vring_queue_pair(dev, i);
+}
+
+/*
+ * Invoked when there is a new vhost-user connection established (when
+ * there is a new virtio device being attached).
+ */
+int
+vhost_new_device(void)
+{
+	struct virtio_net *dev;
+	int i;
+
+	dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+	if (dev == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to allocate memory for new dev.\n");
+		return -1;
+	}
+
+	for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+		if (vhost_devices[i] == NULL)
+			break;
+	}
+	if (i == MAX_VHOST_DEVICE) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to find a free slot for new device.\n");
+		rte_free(dev);
+		return -1;
+	}
+
+	vhost_devices[i] = dev;
+	dev->vid = i;
+
+	return i;
+}
+
+/*
+ * Invoked when the vhost-user connection is broken (when
+ * the virtio device is being detached).
+ */
+void
+vhost_destroy_device(int vid)
+{
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return;
+
+	if (dev->flags & VIRTIO_DEV_RUNNING) {
+		dev->flags &= ~VIRTIO_DEV_RUNNING;
+		notify_ops->destroy_device(vid);
+	}
+
+	cleanup_device(dev, 1);
+	free_device(dev);
+
+	vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+	struct virtio_net *dev;
+	unsigned int len;
+
+	dev = get_device(vid);
+	if (dev == NULL)
+		return;
+
+	len = if_len > sizeof(dev->ifname) ?
+ sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +uint32_t +rte_vhost_get_queue_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->virt_qp_nb; +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +uint64_t rte_vhost_feature_get(void) +{ + return VHOST_FEATURES; +} + +int rte_vhost_feature_disable(uint64_t feature_mask) +{ + VHOST_FEATURES = VHOST_FEATURES & ~feature_mask; + return 0; +} + +int rte_vhost_feature_enable(uint64_t feature_mask) +{ + if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) { + VHOST_FEATURES = VHOST_FEATURES | feature_mask; + return 0; + } + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops) +{ + notify_ops = ops; + + return 0; +} diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h new file mode 100644 index 00000000..22564f1c --- /dev/null +++ b/lib/librte_vhost/vhost.h @@ -0,0 +1,293 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
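Because VHOST_FEATURES starts out as the full VHOST_SUPPORTED_FEATURES mask, the three feature calls above are how an application trims what is offered during negotiation. A small sketch, assuming the VIRTIO_NET_F_* macros are in scope and that it runs before any frontend connects:

#include <inttypes.h>
#include <stdio.h>
#include <rte_virtio_net.h>

static void
trim_offered_features(void)
{
	/* Stop advertising host TSO to guests. */
	rte_vhost_feature_disable((1ULL << VIRTIO_NET_F_HOST_TSO4) |
				  (1ULL << VIRTIO_NET_F_HOST_TSO6));

	printf("now offering: 0x%" PRIx64 "\n", rte_vhost_feature_get());
}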
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+
+#include <rte_log.h>
+
+#include "rte_virtio_net.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+	uint64_t buf_addr;
+	uint32_t buf_len;
+	uint32_t desc_idx;
+};
+
+/*
+ * A structure to hold some fields needed in zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+	struct rte_mbuf *mbuf;
+	uint32_t desc_idx;
+	uint16_t in_use;
+
+	TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+	struct vring_desc	*desc;
+	struct vring_avail	*avail;
+	struct vring_used	*used;
+	uint32_t		size;
+
+	uint16_t		last_avail_idx;
+	uint16_t		last_used_idx;
+#define VIRTIO_INVALID_EVENTFD		(-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD	(-2)
+
+	/* Backend value to determine if device should be started/stopped */
+	int			backend;
+	/* Used to notify the guest (trigger interrupt) */
+	int			callfd;
+	/* Currently unused as polling mode is enabled */
+	int			kickfd;
+	int			enabled;
+
+	/* Physical address of used ring, for logging */
+	uint64_t		log_guest_addr;
+
+	uint16_t		nr_zmbuf;
+	uint16_t		zmbuf_size;
+	uint16_t		last_zmbuf_idx;
+	struct zcopy_mbuf	*zmbufs;
+	struct zcopy_mbuf_list	zmbuf_list;
+
+	struct vring_used_elem	*shadow_used_ring;
+	uint16_t		shadow_used_idx;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macro defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+
+/*
+ * Make an extra wrapper for VIRTIO_NET_F_MQ and
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they were
+ * introduced in kernel v3.8. This keeps our
+ * code buildable for older kernels.
+ */
+#ifdef VIRTIO_NET_F_MQ
+ #define VHOST_MAX_QUEUE_PAIRS	VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
+ #define VHOST_SUPPORTS_MQ	(1ULL << VIRTIO_NET_F_MQ)
+#else
+ #define VHOST_MAX_QUEUE_PAIRS	1
+ #define VHOST_SUPPORTS_MQ	0
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+struct guest_page {
+	uint64_t guest_phys_addr;
+	uint64_t host_phys_addr;
+	uint64_t size;
+};
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct virtio_memory *mem; + uint64_t features; + uint64_t protocol_features; + int vid; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t virt_qp_nb; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; +} __rte_cache_aligned; + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct virtio_memory_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + + +/** + * Memory structure includes region and mapping information. + */ +struct virtio_memory { + uint32_t nregions; + struct virtio_memory_region regions[0]; +}; + + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define LOG_LEVEL RTE_LOG_DEBUG +#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define LOG_LEVEL RTE_LOG_INFO +#define LOG_DEBUG(log_type, fmt, args...) 
do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE	1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Convert guest physical address to host virtual address */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t gpa)
+{
+	struct virtio_memory_region *reg;
+	uint32_t i;
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		reg = &dev->mem->regions[i];
+		if (gpa >= reg->guest_phys_addr &&
+		    gpa <  reg->guest_phys_addr + reg->size) {
+			return gpa - reg->guest_phys_addr +
+			       reg->host_user_addr;
+		}
+	}
+
+	return 0;
+}
+
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+	uint32_t i;
+	struct guest_page *page;
+
+	for (i = 0; i < dev->nr_guest_pages; i++) {
+		page = &dev->guest_pages[i];
+
+		if (gpa >= page->guest_phys_addr &&
+		    gpa + size < page->guest_phys_addr + page->size) {
+			return gpa - page->guest_phys_addr +
+			       page->host_phys_addr;
+		}
+	}
+
+	return 0;
+}
+
+extern struct virtio_net_device_ops const *notify_ops;
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(void);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+void vhost_enable_dequeue_zero_copy(int vid);
+
+/*
+ * Backend-specific cleanup.
+ *
+ * TODO: fix it; we have one backend now
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_cuse/eventfd_copy.c b/lib/librte_vhost/vhost_cuse/eventfd_copy.c
deleted file mode 100644
index 154b32a4..00000000
--- a/lib/librte_vhost/vhost_cuse/eventfd_copy.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in
- *     the documentation and/or other materials provided with the
- *     distribution.
- *   * Neither the name of Intel Corporation nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED.
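Both translation helpers above return 0 on a miss, so callers must check for that before dereferencing. A sketch of the usual pattern for resolving a vring descriptor's buffer (desc_to_ptr is a hypothetical helper, not part of this patch; it assumes vhost.h is included and dev->mem is populated):

static void *
desc_to_ptr(struct virtio_net *dev, const struct vring_desc *desc)
{
	uint64_t vva = gpa_to_vva(dev, desc->addr);

	/* 0 means the address fell outside every registered region. */
	return vva ? (void *)(uintptr_t)vva : NULL;
}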
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "eventfd_link/eventfd_link.h" -#include "eventfd_copy.h" -#include "vhost-net.h" - -static const char eventfd_cdev[] = "/dev/eventfd-link"; - -static int eventfd_link = -1; - -int -eventfd_init(void) -{ - if (eventfd_link >= 0) - return 0; - - eventfd_link = open(eventfd_cdev, O_RDWR); - if (eventfd_link < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "eventfd_link module is not loaded\n"); - return -1; - } - - return 0; -} - -int -eventfd_free(void) -{ - if (eventfd_link >= 0) - close(eventfd_link); - return 0; -} - -/* - * This function uses the eventfd_link kernel module to copy an eventfd file - * descriptor provided by QEMU in to our process space. - */ -int -eventfd_copy(int target_fd, int target_pid) -{ - int ret; - struct eventfd_copy2 eventfd_copy2; - - - /* Open the character device to the kernel module. */ - /* TODO: check this earlier rather than fail until VM boots! */ - if (eventfd_init() < 0) - return -1; - - eventfd_copy2.fd = target_fd; - eventfd_copy2.pid = target_pid; - eventfd_copy2.flags = O_NONBLOCK | O_CLOEXEC; - /* Call the IOCTL to copy the eventfd. */ - ret = ioctl(eventfd_link, EVENTFD_COPY2, &eventfd_copy2); - - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "EVENTFD_COPY2 ioctl failed\n"); - return -1; - } - - return ret; -} diff --git a/lib/librte_vhost/vhost_cuse/eventfd_copy.h b/lib/librte_vhost/vhost_cuse/eventfd_copy.h deleted file mode 100644 index 5f446ca0..00000000 --- a/lib/librte_vhost/vhost_cuse/eventfd_copy.h +++ /dev/null @@ -1,45 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
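The eventfd_link module removed here existed only because the CUSE transport had to duplicate QEMU's eventfds across process boundaries with a custom ioctl. vhost-user makes it unnecessary: file descriptors arrive in-band as SCM_RIGHTS ancillary data on the unix socket. A generic sketch of that mechanism (recv_fd is illustrative; the production variant lives in fd_man.c and socket.c):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int
recv_fd(int sockfd)
{
	char data, ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	struct msghdr msgh = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg;
	int fd = -1;

	if (recvmsg(sockfd, &msgh, 0) <= 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
	     cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
			break;
		}
	}
	return fd;
}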
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef _EVENTFD_H -#define _EVENTFD_H - -int -eventfd_init(void); - -int -eventfd_free(void); - -int -eventfd_copy(int target_fd, int target_pid); - -#endif diff --git a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c deleted file mode 100644 index 5d150116..00000000 --- a/lib/librte_vhost/vhost_cuse/vhost-net-cdev.c +++ /dev/null @@ -1,431 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "virtio-net-cdev.h" -#include "vhost-net.h" -#include "eventfd_copy.h" - -#define FUSE_OPT_DUMMY "\0\0" -#define FUSE_OPT_FORE "-f\0\0" -#define FUSE_OPT_NOMULTI "-s\0\0" - -static const uint32_t default_major = 231; -static const uint32_t default_minor = 1; -static const char cuse_device_name[] = "/dev/cuse"; -static const char default_cdev[] = "vhost-net"; - -static struct fuse_session *session; - -/* - * Returns vhost_cuse_device_ctx from given fuse_req_t. The - * index is populated later when the device is added to the - * device linked list. 
- */ -static struct vhost_cuse_device_ctx -fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) -{ - struct vhost_cuse_device_ctx ctx; - struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); - - ctx.pid = req_ctx->pid; - ctx.vid = (int)fi->fh; - - return ctx; -} - -/* - * When the device is created in QEMU it gets initialised here and - * added to the device linked list. - */ -static void -vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) -{ - int vid = 0; - - vid = vhost_new_device(); - if (vid == -1) { - fuse_reply_err(req, EPERM); - return; - } - - fi->fh = vid; - - RTE_LOG(INFO, VHOST_CONFIG, - "(%d) device configuration started\n", vid); - fuse_reply_open(req, fi); -} - -/* - * When QEMU is shutdown or killed the device gets released. - */ -static void -vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) -{ - int err = 0; - struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - - vhost_destroy_device(ctx.vid); - RTE_LOG(INFO, VHOST_CONFIG, "(%d) device released\n", ctx.vid); - fuse_reply_err(req, err); -} - -/* - * Boilerplate code for CUSE IOCTL - * Implicit arguments: vid, req, result. - */ -#define VHOST_IOCTL(func) do { \ - result = (func)(vid); \ - fuse_reply_ioctl(req, result, NULL, 0); \ -} while (0) - -/* - * Boilerplate IOCTL RETRY - * Implicit arguments: req. - */ -#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ - struct iovec iov_r = { arg, (size_r) }; \ - struct iovec iov_w = { arg, (size_w) }; \ - fuse_reply_ioctl_retry(req, &iov_r, \ - (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ -} while (0) - -/* - * Boilerplate code for CUSE Read IOCTL - * Implicit arguments: vid, req, result, in_bufsz, in_buf. - */ -#define VHOST_IOCTL_R(type, var, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type), 0);\ - } else { \ - (var) = *(const type*)in_buf; \ - result = func(vid, &(var)); \ - fuse_reply_ioctl(req, result, NULL, 0);\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Write IOCTL - * Implicit arguments: vid, req, result, out_bufsz. - */ -#define VHOST_IOCTL_W(type, var, func) do { \ - if (!out_bufsz) { \ - VHOST_IOCTL_RETRY(0, sizeof(type));\ - } else { \ - result = (func)(vid, &(var));\ - fuse_reply_ioctl(req, result, &(var), sizeof(type));\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Read/Write IOCTL - * Implicit arguments: vid, req, result, in_bufsz, in_buf. - */ -#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ - } else { \ - (var1) = *(const type1*) (in_buf); \ - result = (func)(vid, (var1), &(var2)); \ - fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ - } \ -} while (0) - -/* - * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on the type - * of IOCTL a buffer is requested to read or to write. This request is handled - * by FUSE and the buffer is then given to CUSE. 
- */ -static void -vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, - struct fuse_file_info *fi, __rte_unused unsigned flags, - const void *in_buf, size_t in_bufsz, size_t out_bufsz) -{ - struct vhost_cuse_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - struct vhost_vring_file file; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - uint64_t features; - uint32_t index; - int result = 0; - int vid = ctx.vid; - - switch (cmd) { - case VHOST_NET_SET_BACKEND: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_NET_SET_BACKEND\n", ctx.vid); - if (!in_buf) { - VHOST_IOCTL_RETRY(sizeof(file), 0); - break; - } - file = *(const struct vhost_vring_file *)in_buf; - result = cuse_set_backend(ctx, &file); - fuse_reply_ioctl(req, result, NULL, 0); - break; - - case VHOST_GET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_GET_FEATURES\n", vid); - VHOST_IOCTL_W(uint64_t, features, vhost_get_features); - break; - - case VHOST_SET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_FEATURES\n", vid); - VHOST_IOCTL_R(uint64_t, features, vhost_set_features); - break; - - case VHOST_RESET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_RESET_OWNER\n", vid); - VHOST_IOCTL(vhost_reset_owner); - break; - - case VHOST_SET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_OWNER\n", vid); - VHOST_IOCTL(vhost_set_owner); - break; - - case VHOST_SET_MEM_TABLE: - /*TODO fix race condition.*/ - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_MEM_TABLE\n", vid); - static struct vhost_memory mem_temp; - - switch (in_bufsz) { - case 0: - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); - break; - - case sizeof(struct vhost_memory): - mem_temp = *(const struct vhost_memory *) in_buf; - - if (mem_temp.nregions > 0) { - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + - (sizeof(struct vhost_memory_region) * - mem_temp.nregions), 0); - } else { - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - default: - result = cuse_set_mem_table(ctx, in_buf, - mem_temp.nregions); - if (result) - fuse_reply_err(req, EINVAL); - else - fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - case VHOST_SET_VRING_NUM: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_VRING_NUM\n", vid); - VHOST_IOCTL_R(struct vhost_vring_state, state, - vhost_set_vring_num); - break; - - case VHOST_SET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_VRING_BASE\n", vid); - VHOST_IOCTL_R(struct vhost_vring_state, state, - vhost_set_vring_base); - break; - - case VHOST_GET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_GET_VRING_BASE\n", vid); - VHOST_IOCTL_RW(uint32_t, index, - struct vhost_vring_state, state, vhost_get_vring_base); - break; - - case VHOST_SET_VRING_ADDR: - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_VRING_ADDR\n", vid); - VHOST_IOCTL_R(struct vhost_vring_addr, addr, - vhost_set_vring_addr); - break; - - case VHOST_SET_VRING_KICK: - case VHOST_SET_VRING_CALL: - if (cmd == VHOST_SET_VRING_KICK) - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_VRING_KICK\n", vid); - else - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: VHOST_SET_VRING_CALL\n", vid); - if (!in_buf) - VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0); - else { - int fd; - file = *(const struct vhost_vring_file *)in_buf; - LOG_DEBUG(VHOST_CONFIG, - "idx:%d fd:%d\n", file.index, file.fd); - fd = eventfd_copy(file.fd, ctx.pid); - if (fd < 0) { - fuse_reply_ioctl(req, -1, NULL, 0); - result = -1; - break; - } - file.fd = fd; - if (cmd == VHOST_SET_VRING_KICK) { - result = 
vhost_set_vring_kick(vid, &file); - fuse_reply_ioctl(req, result, NULL, 0); - } else { - result = vhost_set_vring_call(vid, &file); - fuse_reply_ioctl(req, result, NULL, 0); - } - } - break; - - default: - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) IOCTL: DOESN NOT EXIST\n", vid); - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - - if (result < 0) - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: FAIL\n", vid); - else - LOG_DEBUG(VHOST_CONFIG, - "(%d) IOCTL: SUCCESS\n", vid); -} - -/* - * Structure handling open, release and ioctl function pointers is populated. - */ -static const struct cuse_lowlevel_ops vhost_net_ops = { - .open = vhost_net_open, - .release = vhost_net_release, - .ioctl = vhost_net_ioctl, -}; - -/* - * cuse_info is populated and used to register the cuse device. - * vhost_net_device_ops are also passed when the device is registered in app. - */ -int -rte_vhost_driver_register(const char *dev_name, uint64_t flags) -{ - struct cuse_info cuse_info; - char device_name[PATH_MAX] = ""; - char char_device_name[PATH_MAX] = ""; - const char *device_argv[] = { device_name }; - - char fuse_opt_dummy[] = FUSE_OPT_DUMMY; - char fuse_opt_fore[] = FUSE_OPT_FORE; - char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; - char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; - - if (flags) { - RTE_LOG(ERR, VHOST_CONFIG, - "vhost-cuse does not support any flags so far\n"); - return -1; - } - - if (access(cuse_device_name, R_OK | W_OK) < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s can't be accessed, maybe not exist\n", - cuse_device_name); - return -1; - } - - if (eventfd_init() < 0) - return -1; - - /* - * The device name is created. This is passed to QEMU so that it can - * register the device with our application. - */ - snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); - snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); - - /* Check if device already exists. */ - if (access(char_device_name, F_OK) != -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s already exists\n", char_device_name); - return -1; - } - - memset(&cuse_info, 0, sizeof(cuse_info)); - cuse_info.dev_major = default_major; - cuse_info.dev_minor = default_minor; - cuse_info.dev_info_argc = 1; - cuse_info.dev_info_argv = device_argv; - cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; - - session = cuse_lowlevel_setup(3, fuse_argv, - &cuse_info, &vhost_net_ops, 0, NULL); - if (session == NULL) - return -1; - - return 0; -} - -/** - * An empty function for unregister - */ -int -rte_vhost_driver_unregister(const char *dev_name __rte_unused) -{ - return 0; -} - -/** - * The CUSE session is launched allowing the application to receive open, - * release and ioctl calls. - */ -int -rte_vhost_driver_session_start(void) -{ - fuse_session_loop(session); - - return 0; -} diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c deleted file mode 100644 index 552be7d4..00000000 --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c +++ /dev/null @@ -1,433 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
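All of the VHOST_IOCTL_* boilerplate macros in the file deleted above encode the same two-phase unrestricted-ioctl handshake: first reply with fuse_reply_ioctl_retry() so the kernel copies the user buffer in or out, then service the re-delivered call. A stripped-down sketch of the read direction, assuming the libfuse CUSE API, with arg being the caller's user-space pointer:

#include <stdint.h>
#include <fuse/cuse_lowlevel.h>

static void
read_u64_ioctl(fuse_req_t req, void *arg, const void *in_buf,
	       size_t in_bufsz)
{
	if (in_bufsz == 0) {
		/* Phase 1: ask for sizeof(uint64_t) bytes at arg to be
		 * copied in; the ioctl is then re-delivered. */
		struct iovec iov = { arg, sizeof(uint64_t) };

		fuse_reply_ioctl_retry(req, &iov, 1, NULL, 0);
	} else {
		/* Phase 2: in_buf now holds the copied-in value. */
		uint64_t val = *(const uint64_t *)in_buf;

		(void)val; /* e.g. forward to vhost_set_features() */
		fuse_reply_ioctl(req, 0, NULL, 0);
	}
}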
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "rte_virtio_net.h" -#include "vhost-net.h" -#include "virtio-net-cdev.h" -#include "eventfd_copy.h" - -/* Line size for reading maps file. */ -static const uint32_t BUFSIZE = PATH_MAX; - -/* Size of prot char array in procmap. */ -#define PROT_SZ 5 - -/* Number of elements in procmap struct. */ -#define PROCMAP_SZ 8 - -/* Structure containing information gathered from maps file. */ -struct procmap { - uint64_t va_start; /* Start virtual address in file. */ - uint64_t len; /* Size of file. */ - uint64_t pgoff; /* Not used. */ - uint32_t maj; /* Not used. */ - uint32_t min; /* Not used. */ - uint32_t ino; /* Not used. */ - char prot[PROT_SZ]; /* Not used. */ - char fname[PATH_MAX]; /* File name. */ -}; - -/* - * Locate the file containing QEMU's memory space and - * map it to our address space. - */ -static int -host_memory_map(pid_t pid, uint64_t addr, - uint64_t *mapped_address, uint64_t *mapped_size) -{ - struct dirent *dptr = NULL; - struct procmap procmap; - DIR *dp = NULL; - int fd; - int i; - char memfile[PATH_MAX]; - char mapfile[PATH_MAX]; - char procdir[PATH_MAX]; - char resolved_path[PATH_MAX]; - char *path = NULL; - FILE *fmap; - void *map; - uint8_t found = 0; - char line[BUFSIZE]; - char dlm[] = "- : "; - char *str, *sp, *in[PROCMAP_SZ]; - char *end = NULL; - - /* Path where mem files are located. */ - snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); - /* Maps file used to locate mem file. */ - snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); - - fmap = fopen(mapfile, "r"); - if (fmap == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to open maps file for pid %d\n", - pid); - return -1; - } - - /* Read through maps file until we find out base_address. */ - while (fgets(line, BUFSIZE, fmap) != 0) { - str = line; - errno = 0; - /* Split line into fields. */ - for (i = 0; i < PROCMAP_SZ; i++) { - in[i] = strtok_r(str, &dlm[i], &sp); - if ((in[i] == NULL) || (errno != 0)) { - fclose(fmap); - return -1; - } - str = NULL; - } - - /* Convert/Copy each field as needed. 
*/ - procmap.va_start = strtoull(in[0], &end, 16); - if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.len = strtoull(in[1], &end, 16); - if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.pgoff = strtoull(in[3], &end, 16); - if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.maj = strtoul(in[4], &end, 16); - if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.min = strtoul(in[5], &end, 16); - if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.ino = strtoul(in[6], &end, 16); - if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - memcpy(&procmap.prot, in[2], PROT_SZ); - memcpy(&procmap.fname, in[7], PATH_MAX); - - if (procmap.va_start == addr) { - procmap.len = procmap.len - procmap.va_start; - found = 1; - break; - } - } - fclose(fmap); - - if (!found) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to find memory file in pid %d maps file\n", - pid); - return -1; - } - - /* Find the guest memory file among the process fds. */ - dp = opendir(procdir); - if (dp == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Cannot open pid %d process directory\n", - pid); - return -1; - } - - found = 0; - - /* Read the fd directory contents. */ - while (NULL != (dptr = readdir(dp))) { - snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", - pid, dptr->d_name); - path = realpath(memfile, resolved_path); - if ((path == NULL) && (strlen(resolved_path) == 0)) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to resolve fd directory\n"); - closedir(dp); - return -1; - } - if (strncmp(resolved_path, procmap.fname, - strnlen(procmap.fname, PATH_MAX)) == 0) { - found = 1; - break; - } - } - - closedir(dp); - - if (found == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to find memory file for pid %d\n", - pid); - return -1; - } - /* Open the shared memory file and map the memory into this process. 
*/ - fd = open(memfile, O_RDWR); - - if (fd == -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to open %s for pid %d\n", - memfile, pid); - return -1; - } - - map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE, - MAP_POPULATE|MAP_SHARED, fd, 0); - close(fd); - - if (map == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, - "Error mapping the file %s for pid %d\n", - memfile, pid); - return -1; - } - - /* Store the memory address and size in the device data structure */ - *mapped_address = (uint64_t)(uintptr_t)map; - *mapped_size = procmap.len; - - LOG_DEBUG(VHOST_CONFIG, - "Mem File: %s->%s - Size: %llu - VA: %p\n", - memfile, resolved_path, - (unsigned long long)*mapped_size, map); - - return 0; -} - -int -cuse_set_mem_table(struct vhost_cuse_device_ctx ctx, - const struct vhost_memory *mem_regions_addr, uint32_t nregions) -{ - uint64_t size = offsetof(struct vhost_memory, regions); - uint32_t idx, valid_regions; - struct virtio_memory_regions *pregion; - struct vhost_memory_region *mem_regions = (void *)(uintptr_t) - ((uint64_t)(uintptr_t)mem_regions_addr + size); - uint64_t base_address = 0, mapped_address, mapped_size; - struct virtio_net *dev; - - dev = get_device(ctx.vid); - if (dev == NULL) - return -1; - - if (dev->mem && dev->mem->mapped_address) { - munmap((void *)(uintptr_t)dev->mem->mapped_address, - (size_t)dev->mem->mapped_size); - free(dev->mem); - dev->mem = NULL; - } - - dev->mem = calloc(1, sizeof(struct virtio_memory) + - sizeof(struct virtio_memory_regions) * nregions); - if (dev->mem == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to allocate memory for dev->mem\n", - dev->vid); - return -1; - } - - pregion = &dev->mem->regions[0]; - - for (idx = 0; idx < nregions; idx++) { - pregion[idx].guest_phys_address = - mem_regions[idx].guest_phys_addr; - pregion[idx].guest_phys_address_end = - pregion[idx].guest_phys_address + - mem_regions[idx].memory_size; - pregion[idx].memory_size = - mem_regions[idx].memory_size; - pregion[idx].userspace_address = - mem_regions[idx].userspace_addr; - - LOG_DEBUG(VHOST_CONFIG, - "REGION: %u - GPA: %p - QVA: %p - SIZE (%"PRIu64")\n", - idx, - (void *)(uintptr_t)pregion[idx].guest_phys_address, - (void *)(uintptr_t)pregion[idx].userspace_address, - pregion[idx].memory_size); - - /*set the base address mapping*/ - if (pregion[idx].guest_phys_address == 0x0) { - base_address = - pregion[idx].userspace_address; - /* Map VM memory file */ - if (host_memory_map(ctx.pid, base_address, - &mapped_address, &mapped_size) != 0) { - free(dev->mem); - dev->mem = NULL; - return -1; - } - dev->mem->mapped_address = mapped_address; - dev->mem->base_address = base_address; - dev->mem->mapped_size = mapped_size; - } - } - - /* Check that we have a valid base address. 
*/ - if (base_address == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to find base address of qemu memory file.\n"); - free(dev->mem); - dev->mem = NULL; - return -1; - } - - valid_regions = nregions; - for (idx = 0; idx < nregions; idx++) { - if ((pregion[idx].userspace_address < base_address) || - (pregion[idx].userspace_address > - (base_address + mapped_size))) - valid_regions--; - } - - - if (valid_regions != nregions) { - valid_regions = 0; - for (idx = nregions; 0 != idx--; ) { - if ((pregion[idx].userspace_address < base_address) || - (pregion[idx].userspace_address > - (base_address + mapped_size))) { - memmove(&pregion[idx], &pregion[idx + 1], - sizeof(struct virtio_memory_regions) * - valid_regions); - } else - valid_regions++; - } - } - - for (idx = 0; idx < valid_regions; idx++) { - pregion[idx].address_offset = - mapped_address - base_address + - pregion[idx].userspace_address - - pregion[idx].guest_phys_address; - } - dev->mem->nregions = valid_regions; - - return 0; -} - -/* - * Function to get the tap device name from the provided file descriptor and - * save it in the device structure. - */ -static int -get_ifname(int vid, int tap_fd, int pid) -{ - int fd_tap; - struct ifreq ifr; - uint32_t ifr_size; - int ret; - - fd_tap = eventfd_copy(tap_fd, pid); - if (fd_tap < 0) - return -1; - - ret = ioctl(fd_tap, TUNGETIFF, &ifr); - - if (close(fd_tap) < 0) - RTE_LOG(ERR, VHOST_CONFIG, "(%d) fd close failed\n", vid); - - if (ret >= 0) { - ifr_size = strnlen(ifr.ifr_name, sizeof(ifr.ifr_name)); - vhost_set_ifname(vid, ifr.ifr_name, ifr_size); - } else - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) TUNGETIFF ioctl failed\n", vid); - - return 0; -} - -int -cuse_set_backend(struct vhost_cuse_device_ctx ctx, - struct vhost_vring_file *file) -{ - struct virtio_net *dev; - - dev = get_device(ctx.vid); - if (dev == NULL) - return -1; - - if (!(dev->flags & VIRTIO_DEV_RUNNING) && file->fd != VIRTIO_DEV_STOPPED) - get_ifname(ctx.vid, file->fd, ctx.pid); - - return vhost_set_backend(ctx.vid, file); -} - -void -vhost_backend_cleanup(struct virtio_net *dev) -{ - /* Unmap QEMU memory file if mapped. */ - if (dev->mem) { - munmap((void *)(uintptr_t)dev->mem->mapped_address, - (size_t)dev->mem->mapped_size); - free(dev->mem); - dev->mem = NULL; - } -} diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h deleted file mode 100644 index 3f67154b..00000000 --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.h +++ /dev/null @@ -1,56 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
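host_memory_map() above tokenizes each /proc/<pid>/maps line with strtok_r and validates every field by hand; the same parse can be written more compactly with sscanf. A sketch following the field layout documented in proc(5) (the struct and buffer sizes are illustrative):

#include <inttypes.h>
#include <stdio.h>

struct map_line {
	uint64_t start, end, pgoff;
	unsigned int maj, min;
	unsigned long ino;
	char prot[5], fname[4096];
};

static int
parse_maps_line(const char *line, struct map_line *m)
{
	int n = sscanf(line,
		       "%" SCNx64 "-%" SCNx64 " %4s %" SCNx64 " %x:%x %lu %4095s",
		       &m->start, &m->end, m->prot, &m->pgoff,
		       &m->maj, &m->min, &m->ino, m->fname);

	/* Anonymous mappings have no pathname field. */
	return n >= 7 ? 0 : -1;
}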
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef _VIRTIO_NET_CDEV_H -#define _VIRTIO_NET_CDEV_H - -#include -#include - -#include "vhost-net.h" - -/* - * Structure used to identify device context. - */ -struct vhost_cuse_device_ctx { - pid_t pid; /* PID of process calling the IOCTL. */ - int vid; /* Virtio-net device ID */ -}; - -int -cuse_set_mem_table(struct vhost_cuse_device_ctx ctx, - const struct vhost_memory *mem_regions_addr, uint32_t nregions); - -int -cuse_set_backend(struct vhost_cuse_device_ctx ctx, struct vhost_vring_file *); - -#endif diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c deleted file mode 100644 index 5806f99a..00000000 --- a/lib/librte_vhost/vhost_rxtx.c +++ /dev/null @@ -1,931 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vhost-net.h" - -#define MAX_PKT_BURST 32 -#define VHOST_LOG_PAGE 4096 - -static inline void __attribute__((always_inline)) -vhost_log_page(uint8_t *log_base, uint64_t page) -{ - log_base[page / 8] |= 1 << (page % 8); -} - -static inline void __attribute__((always_inline)) -vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) -{ - uint64_t page; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base || !len)) - return; - - if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) - return; - - /* To make sure guest memory updates are committed before logging */ - rte_smp_wmb(); - - page = addr / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < addr + len) { - vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); - page += 1; - } -} - -static inline void __attribute__((always_inline)) -vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t offset, uint64_t len) -{ - vhost_log_write(dev, vq->log_guest_addr + offset, len); -} - -static bool -is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb) -{ - return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM; -} - -static void -virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) -{ - if (m_buf->ol_flags & PKT_TX_L4_MASK) { - net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len; - - switch (m_buf->ol_flags & PKT_TX_L4_MASK) { - case PKT_TX_TCP_CKSUM: - net_hdr->csum_offset = (offsetof(struct tcp_hdr, - cksum)); - break; - case PKT_TX_UDP_CKSUM: - net_hdr->csum_offset = (offsetof(struct udp_hdr, - dgram_cksum)); - break; - case PKT_TX_SCTP_CKSUM: - net_hdr->csum_offset = (offsetof(struct sctp_hdr, - cksum)); - break; - } - } - - if (m_buf->ol_flags & PKT_TX_TCP_SEG) { - if (m_buf->ol_flags & PKT_TX_IPV4) - net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else - net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - net_hdr->gso_size = m_buf->tso_segsz; - net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len - + m_buf->l4_len; - } -} - -static inline void -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr, - struct virtio_net_hdr_mrg_rxbuf hdr) -{ - if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) - *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; - else - *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; -} - -static inline int __attribute__((always_inline)) -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf *m, uint16_t desc_idx) -{ - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - struct vring_desc *desc; - uint64_t desc_addr; - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; - - desc = &vq->desc[desc_idx]; - desc_addr = gpa_to_vva(dev, desc->addr); - /* - * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid - * performance issue with some versions of gcc (4.8.4 and 5.3.0) which - * otherwise stores offset on the stack instead of in a register. 
- */ - if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) - return -1; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - virtio_enqueue_offload(m, &virtio_hdr.hdr); - copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); - vhost_log_write(dev, desc->addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - - desc_offset = dev->vhost_hlen; - desc_avail = desc->len - dev->vhost_hlen; - - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current mbuf, fetch next */ - if (mbuf_avail == 0) { - m = m->next; - - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } - - /* done with current desc buf, fetch next */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) { - /* Room in vring buffer is not enough */ - return -1; - } - if (unlikely(desc->next >= vq->size)) - return -1; - - desc = &vq->desc[desc->next]; - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - desc_offset = 0; - desc_avail = desc->len; - } - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, desc->addr + desc_offset, cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - } - - return 0; -} - -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are succesfully - * added to the RX queue. This function works when the mbuf is scattered, but - * it doesn't support the mergeable feature. - */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) -{ - struct vhost_virtqueue *vq; - uint16_t avail_idx, free_entries, start_idx; - uint16_t desc_indexes[MAX_PKT_BURST]; - uint16_t used_idx; - uint32_t i; - - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - start_idx = vq->last_used_idx; - free_entries = avail_idx - start_idx; - count = RTE_MIN(count, free_entries); - count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); - if (count == 0) - return 0; - - LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", - dev->vid, start_idx, start_idx + count); - - /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ - rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); - for (i = 0; i < count; i++) { - used_idx = (start_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[used_idx]; - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = pkts[i]->pkt_len + - dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - uint16_t desc_idx = desc_indexes[i]; - int err; - - err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx); - if (unlikely(err)) { - used_idx = (start_idx + i) & (vq->size - 1); - vq->used->ring[used_idx].len = dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - if (i + 1 < count) - rte_prefetch0(&vq->desc[desc_indexes[i+1]]); - } - - rte_smp_wmb(); - - *(volatile uint16_t *)&vq->used->idx += count; - vq->last_used_idx += count; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - /* flush used->idx update before we read avail->flags. */ - rte_mb(); - - /* Kick the guest if necessary. */ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); - return count; -} - -static inline int -fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx, - uint32_t *allocated, uint32_t *vec_idx, - struct buf_vector *buf_vec) -{ - uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; - uint32_t vec_id = *vec_idx; - uint32_t len = *allocated; - - while (1) { - if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) - return -1; - - len += vq->desc[idx].len; - buf_vec[vec_id].buf_addr = vq->desc[idx].addr; - buf_vec[vec_id].buf_len = vq->desc[idx].len; - buf_vec[vec_id].desc_idx = idx; - vec_id++; - - if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0) - break; - - idx = vq->desc[idx].next; - } - - *allocated = len; - *vec_idx = vec_id; - - return 0; -} - -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size, - uint16_t *end, struct buf_vector *buf_vec) -{ - uint16_t cur_idx; - uint16_t avail_idx; - uint32_t allocated = 0; - uint32_t vec_idx = 0; - uint16_t tries = 0; - - cur_idx = vq->last_used_idx; - - while (1) { - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - if (unlikely(cur_idx == avail_idx)) - return -1; - - if (unlikely(fill_vec_buf(vq, cur_idx, &allocated, - &vec_idx, buf_vec) < 0)) - return -1; - - cur_idx++; - tries++; - - if (allocated >= size) - break; - - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
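
fill_vec_buf() and reserve_avail_buf_mergeable() above gather descriptor chains into buf_vec until the packet fits, giving up once vq->size attempts have been made. Here is that accumulate-until-enough loop reduced to a plain array, with all names hypothetical and the VRING_DESC_F_NEXT chain walking dropped:

#include <stdint.h>
#include <stdio.h>

struct buf { uint32_t len; };

/* Walk 'bufs' from *cur, collecting entries until 'need' bytes are
 * reserved; returns the number of entries used, or -1 on shortage. */
static int
reserve(const struct buf *bufs, int nbufs, int *cur, uint32_t need)
{
	uint32_t got = 0;
	int used = 0;

	while (got < need) {
		if (*cur >= nbufs)	/* ring exhausted: abnormal */
			return -1;
		got += bufs[(*cur)++].len;
		used++;
	}
	return used;
}

int
main(void)
{
	struct buf bufs[] = { {1500}, {1500}, {1500}, {1500} };
	int cur = 0;

	printf("entries for 4000B: %d\n", reserve(bufs, 4, &cur, 4000));
	printf("entries for 4000B more: %d\n", reserve(bufs, 4, &cur, 4000));
	return 0;	/* prints 3, then -1: the ring ran out */
}
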
- */ - if (unlikely(tries >= vq->size)) - return -1; - } - - *end = cur_idx; - return 0; -} - -static inline uint32_t __attribute__((always_inline)) -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint16_t end_idx, struct rte_mbuf *m, - struct buf_vector *buf_vec) -{ - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; - uint32_t vec_idx = 0; - uint16_t start_idx = vq->last_used_idx; - uint16_t cur_idx = start_idx; - uint64_t desc_addr; - uint32_t desc_chain_head; - uint32_t desc_chain_len; - uint32_t mbuf_offset, mbuf_avail; - uint32_t desc_offset, desc_avail; - uint32_t cpy_len; - uint16_t desc_idx, used_idx; - - if (unlikely(m == NULL)) - return 0; - - LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, cur_idx, end_idx); - - desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); - if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) - return 0; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - virtio_hdr.num_buffers = end_idx - start_idx; - LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", - dev->vid, virtio_hdr.num_buffers); - - virtio_enqueue_offload(m, &virtio_hdr.hdr); - copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); - vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - - desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; - desc_chain_head = buf_vec[vec_idx].desc_idx; - desc_chain_len = desc_offset; - - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current desc buf, get the next one */ - if (desc_avail == 0) { - desc_idx = buf_vec[vec_idx].desc_idx; - vec_idx++; - - if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) { - /* Update used ring with desc information */ - used_idx = cur_idx++ & (vq->size - 1); - vq->used->ring[used_idx].id = desc_chain_head; - vq->used->ring[used_idx].len = desc_chain_len; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, - ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - desc_chain_head = buf_vec[vec_idx].desc_idx; - desc_chain_len = 0; - } - - desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); - if (unlikely(!desc_addr)) - return 0; - - /* Prefetch buffer address. 
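
The rte_prefetch0() calls sprinkled through this file follow a software-pipelining pattern: request the cache line for element i + 1 while element i is being processed. The same shape with the GCC/Clang builtin; the summing loop is purely illustrative:

#include <stddef.h>
#include <stdio.h>

/* Process item i while the line for item i + 1 is in flight. */
static long
sum_pipelined(const long *v, size_t n)
{
	long acc = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (i + 1 < n)
			/* read prefetch, keep in all cache levels */
			__builtin_prefetch(&v[i + 1], 0, 3);
		acc += v[i];
	}
	return acc;
}

int
main(void)
{
	long v[] = { 1, 2, 3, 4, 5 };

	printf("%ld\n", sum_pipelined(v, 5));	/* 15 */
	return 0;
}
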
*/ - rte_prefetch0((void *)(uintptr_t)desc_addr); - desc_offset = 0; - desc_avail = buf_vec[vec_idx].buf_len; - } - - /* done with current mbuf, get the next one */ - if (mbuf_avail == 0) { - m = m->next; - - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, - cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - desc_chain_len += cpy_len; - } - - used_idx = cur_idx & (vq->size - 1); - vq->used->ring[used_idx].id = desc_chain_head; - vq->used->ring[used_idx].len = desc_chain_len; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - - return end_idx - start_idx; -} - -static inline uint32_t __attribute__((always_inline)) -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) -{ - struct vhost_virtqueue *vq; - uint32_t pkt_idx = 0, nr_used = 0; - uint16_t end; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - if (count == 0) - return 0; - - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - - if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len, - &end, buf_vec) < 0)) { - LOG_DEBUG(VHOST_DATA, - "(%d) failed to get enough desc from vring\n", - dev->vid); - break; - } - - nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end, - pkts[pkt_idx], buf_vec); - rte_smp_wmb(); - - *(volatile uint16_t *)&vq->used->idx += nr_used; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - vq->last_used_idx += nr_used; - } - - if (likely(pkt_idx)) { - /* flush used->idx update before we read avail->flags. */ - rte_mb(); - - /* Kick the guest if necessary. 
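
The "kick" above is nothing more than an eventfd write: QEMU has wired callfd so the write surfaces as a guest interrupt, and VRING_AVAIL_F_NO_INTERRUPT lets the guest suppress it. A Linux-only sketch of the signal/consume pair, with both ends in one process purely for illustration:

#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int
main(void)
{
	int callfd = eventfd(0, 0);
	eventfd_t val;
	int guest_suppressed = 0;	/* VRING_AVAIL_F_NO_INTERRUPT */

	if (callfd < 0)
		return 1;

	/* Host side: signal "new used entries" unless suppressed. */
	if (!guest_suppressed)
		eventfd_write(callfd, (eventfd_t)1);

	/* Guest/KVM side: the read consumes the pending count. */
	if (eventfd_read(callfd, &val) == 0)
		printf("kicked, counter=%llu\n", (unsigned long long)val);

	close(callfd);
	return 0;
}
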
*/ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); - } - - return pkt_idx; -} - -uint16_t -rte_vhost_enqueue_burst(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) -{ - struct virtio_net *dev = get_device(vid); - - if (!dev) - return 0; - - if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); -} - -static void -parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) -{ - struct ipv4_hdr *ipv4_hdr; - struct ipv6_hdr *ipv6_hdr; - void *l3_hdr = NULL; - struct ether_hdr *eth_hdr; - uint16_t ethertype; - - eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); - - m->l2_len = sizeof(struct ether_hdr); - ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); - - if (ethertype == ETHER_TYPE_VLAN) { - struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); - - m->l2_len += sizeof(struct vlan_hdr); - ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); - } - - l3_hdr = (char *)eth_hdr + m->l2_len; - - switch (ethertype) { - case ETHER_TYPE_IPv4: - ipv4_hdr = (struct ipv4_hdr *)l3_hdr; - *l4_proto = ipv4_hdr->next_proto_id; - m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; - *l4_hdr = (char *)l3_hdr + m->l3_len; - m->ol_flags |= PKT_TX_IPV4; - break; - case ETHER_TYPE_IPv6: - ipv6_hdr = (struct ipv6_hdr *)l3_hdr; - *l4_proto = ipv6_hdr->proto; - m->l3_len = sizeof(struct ipv6_hdr); - *l4_hdr = (char *)l3_hdr + m->l3_len; - m->ol_flags |= PKT_TX_IPV6; - break; - default: - m->l3_len = 0; - *l4_proto = 0; - break; - } -} - -static inline void __attribute__((always_inline)) -vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) -{ - uint16_t l4_proto = 0; - void *l4_hdr = NULL; - struct tcp_hdr *tcp_hdr = NULL; - - parse_ethernet(m, &l4_proto, &l4_hdr); - if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (hdr->csum_start == (m->l2_len + m->l3_len)) { - switch (hdr->csum_offset) { - case (offsetof(struct tcp_hdr, cksum)): - if (l4_proto == IPPROTO_TCP) - m->ol_flags |= PKT_TX_TCP_CKSUM; - break; - case (offsetof(struct udp_hdr, dgram_cksum)): - if (l4_proto == IPPROTO_UDP) - m->ol_flags |= PKT_TX_UDP_CKSUM; - break; - case (offsetof(struct sctp_hdr, cksum)): - if (l4_proto == IPPROTO_SCTP) - m->ol_flags |= PKT_TX_SCTP_CKSUM; - break; - default: - break; - } - } - } - - if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - case VIRTIO_NET_HDR_GSO_TCPV6: - tcp_hdr = (struct tcp_hdr *)l4_hdr; - m->ol_flags |= PKT_TX_TCP_SEG; - m->tso_segsz = hdr->gso_size; - m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; - break; - default: - RTE_LOG(WARNING, VHOST_DATA, - "unsupported gso type %u.\n", hdr->gso_type); - break; - } - } -} - -#define RARP_PKT_SIZE 64 - -static int -make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac) -{ - struct ether_hdr *eth_hdr; - struct arp_hdr *rarp; - - if (rarp_mbuf->buf_len < 64) { - RTE_LOG(WARNING, VHOST_DATA, - "failed to make RARP; mbuf size too small %u (< %d)\n", - rarp_mbuf->buf_len, RARP_PKT_SIZE); - return -1; - } - - /* Ethernet header. */ - eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0); - memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN); - ether_addr_copy(mac, ð_hdr->s_addr); - eth_hdr->ether_type = htons(ETHER_TYPE_RARP); - - /* RARP header. 
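
For reference, the frame assembled here is 42 bytes of Ethernet plus RARP headers, padded out to the RARP_PKT_SIZE of 64 used above. A libc-only sketch that lays out the same bytes by hand, offsets per RFC 903; the MAC value is made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RARP_PKT_SIZE 64

static void
put16(uint8_t *p, uint16_t v)	/* network byte order */
{
	p[0] = v >> 8;
	p[1] = v & 0xff;
}

int
main(void)
{
	const uint8_t mac[6] = { 0x52, 0x54, 0x00, 0xaa, 0xbb, 0xcc };
	uint8_t f[RARP_PKT_SIZE] = { 0 };

	memset(f, 0xff, 6);		/* dst: broadcast           */
	memcpy(f + 6, mac, 6);		/* src: migrated VM's MAC   */
	put16(f + 12, 0x8035);		/* ethertype: RARP          */
	put16(f + 14, 1);		/* hrd: Ethernet            */
	put16(f + 16, 0x0800);		/* pro: IPv4                */
	f[18] = 6;			/* hln                      */
	f[19] = 4;			/* pln                      */
	put16(f + 20, 3);		/* op: reverse request      */
	memcpy(f + 22, mac, 6);		/* sender hardware address  */
	memcpy(f + 32, mac, 6);		/* target hardware address  */
	/* sender/target IPs (f+28, f+38) stay 0.0.0.0, as above   */

	printf("frame ready, %d bytes\n", RARP_PKT_SIZE);
	return 0;
}
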
*/ - rarp = (struct arp_hdr *)(eth_hdr + 1); - rarp->arp_hrd = htons(ARP_HRD_ETHER); - rarp->arp_pro = htons(ETHER_TYPE_IPv4); - rarp->arp_hln = ETHER_ADDR_LEN; - rarp->arp_pln = 4; - rarp->arp_op = htons(ARP_OP_REVREQUEST); - - ether_addr_copy(mac, &rarp->arp_data.arp_sha); - ether_addr_copy(mac, &rarp->arp_data.arp_tha); - memset(&rarp->arp_data.arp_sip, 0x00, 4); - memset(&rarp->arp_data.arp_tip, 0x00, 4); - - rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE; - - return 0; -} - -static inline int __attribute__((always_inline)) -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf *m, uint16_t desc_idx, - struct rte_mempool *mbuf_pool) -{ - struct vring_desc *desc; - uint64_t desc_addr; - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - struct rte_mbuf *cur = m, *prev = m; - struct virtio_net_hdr *hdr; - /* A counter to avoid desc dead loop chain */ - uint32_t nr_desc = 1; - - desc = &vq->desc[desc_idx]; - if (unlikely(desc->len < dev->vhost_hlen)) - return -1; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); - rte_prefetch0(hdr); - - /* - * A virtio driver normally uses at least 2 desc buffers - * for Tx: the first for storing the header, and others - * for storing the data. - */ - if (likely((desc->len == dev->vhost_hlen) && - (desc->flags & VRING_DESC_F_NEXT) != 0)) { - desc = &vq->desc[desc->next]; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - desc_offset = 0; - desc_avail = desc->len; - nr_desc += 1; - - PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); - } else { - desc_avail = desc->len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; - } - - mbuf_offset = 0; - mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; - while (1) { - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), - (void *)((uintptr_t)(desc_addr + desc_offset)), - cpy_len); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - - /* This desc reaches to its end, get the next one */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) - break; - - if (unlikely(desc->next >= vq->size || - ++nr_desc > vq->size)) - return -1; - desc = &vq->desc[desc->next]; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - desc_offset = 0; - desc_avail = desc->len; - - PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); - } - - /* - * This mbuf reaches to its end, get a new one - * to hold more data. 
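
The copy loop above advances two cursors at once, bytes left in the descriptor and bytes left in the current mbuf segment, and grabs a fresh segment whenever the mbuf side runs dry. The same pattern reduced to malloc'ed fixed-size segments; struct seg and chain_copy() are hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct seg {
	char data[8];		/* tiny on purpose, to force chaining */
	size_t len;
	struct seg *next;
};

/* Copy 'n' bytes of 'src' into a chain of fixed-size segments. */
static struct seg *
chain_copy(const char *src, size_t n)
{
	struct seg *head = calloc(1, sizeof(*head));
	struct seg *cur = head;
	size_t off = 0;

	while (head && off < n) {
		size_t room = sizeof(cur->data) - cur->len;

		if (room == 0) {	/* segment full: grab a new one */
			cur->next = calloc(1, sizeof(*cur));
			if (!cur->next)
				break;
			cur = cur->next;
			room = sizeof(cur->data);
		}
		if (room > n - off)
			room = n - off;
		memcpy(cur->data + cur->len, src + off, room);
		cur->len += room;
		off += room;
	}
	return head;
}

int
main(void)
{
	struct seg *s = chain_copy("spread across three segments", 29);
	int nsegs = 0;

	for (; s; s = s->next)	/* leaking the chain is fine for a demo */
		nsegs++;
	printf("segments used: %d\n", nsegs);	/* 4: 8 + 8 + 8 + 5 */
	return 0;
}
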
- */ - if (mbuf_avail == 0) { - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to " - "allocate memory for mbuf.\n"); - return -1; - } - - prev->next = cur; - prev->data_len = mbuf_offset; - m->nb_segs += 1; - m->pkt_len += mbuf_offset; - prev = cur; - - mbuf_offset = 0; - mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } - } - - prev->data_len = mbuf_offset; - m->pkt_len += mbuf_offset; - - if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) - vhost_dequeue_offload(hdr, m); - - return 0; -} - -uint16_t -rte_vhost_dequeue_burst(int vid, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -{ - struct virtio_net *dev; - struct rte_mbuf *rarp_mbuf = NULL; - struct vhost_virtqueue *vq; - uint32_t desc_indexes[MAX_PKT_BURST]; - uint32_t used_idx; - uint32_t i = 0; - uint16_t free_entries; - uint16_t avail_idx; - - dev = get_device(vid); - if (!dev) - return 0; - - if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - /* - * Construct a RARP broadcast packet, and inject it to the "pkts" - * array, to looks like that guest actually send such packet. - * - * Check user_send_rarp() for more information. - */ - if (unlikely(rte_atomic16_cmpset((volatile uint16_t *) - &dev->broadcast_rarp.cnt, 1, 0))) { - rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool); - if (rarp_mbuf == NULL) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - return 0; - } - - if (make_rarp_packet(rarp_mbuf, &dev->mac)) { - rte_pktmbuf_free(rarp_mbuf); - rarp_mbuf = NULL; - } else { - count -= 1; - } - } - - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - free_entries = avail_idx - vq->last_used_idx; - if (free_entries == 0) - goto out; - - LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - /* Prefetch available ring to retrieve head indexes. */ - used_idx = vq->last_used_idx & (vq->size - 1); - rte_prefetch0(&vq->avail->ring[used_idx]); - rte_prefetch0(&vq->used->ring[used_idx]); - - count = RTE_MIN(count, MAX_PKT_BURST); - count = RTE_MIN(count, free_entries); - LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - /* Retrieve all of the head indexes first to avoid caching issues. */ - for (i = 0; i < count; i++) { - used_idx = (vq->last_used_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[used_idx]; - - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = 0; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - int err; - - if (likely(i + 1 < count)) - rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); - - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; - } - err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } - } - - rte_smp_wmb(); - rte_smp_rmb(); - vq->used->idx += i; - vq->last_used_idx += i; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - /* Kick guest if required. 
*/ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); - -out: - if (unlikely(rarp_mbuf != NULL)) { - /* - * Inject it to the head of "pkts" array, so that switch's mac - * learning table will get updated first. - */ - memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *)); - pkts[0] = rarp_mbuf; - i += 1; - } - - return i; -} diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c new file mode 100644 index 00000000..6b83c15f --- /dev/null +++ b/lib/librte_vhost/vhost_user.c @@ -0,0 +1,1033 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include + +#include "vhost.h" +#include "vhost_user.h" + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct virtio_memory_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. + */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(void) +{ + return VHOST_FEATURES; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + if (features & ~VHOST_FEATURES) + return -1; + + dev->features = features; + if (dev->features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. 
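
vhost_user_set_features() above also fixes the virtio-net header length: 12 bytes once mergeable RX buffers or virtio 1.0 are negotiated, 10 bytes otherwise. A compact model of that negotiation follows; the bit positions match the virtio spec, everything else is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define F_MRG_RXBUF	15	/* VIRTIO_NET_F_MRG_RXBUF */
#define F_VERSION_1	32	/* VIRTIO_F_VERSION_1     */

int
main(void)
{
	uint64_t offered = (1ULL << F_MRG_RXBUF) | (1ULL << F_VERSION_1);
	uint64_t acked = offered & ~(1ULL << F_MRG_RXBUF); /* guest drops one */

	if (acked & ~offered) {
		puts("guest acked a bit we never offered: reject");
		return 1;
	}

	/* 12-byte header (with num_buffers) vs the legacy 10-byte one. */
	size_t vhost_hlen =
	    (acked & ((1ULL << F_MRG_RXBUF) | (1ULL << F_VERSION_1)))
		? 12 : 10;

	printf("negotiated hlen = %zu\n", vhost_hlen);	/* 12 */
	return 0;
}
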
+ */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + struct vhost_virtqueue *vq = dev->virtqueue[state->index]; + + vq->size = state->num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. + */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + /* + * vq is allocated on pairs, we should try to do realloc + * on first queue of one queue pair only. + */ + if (index % VIRTIO_QNUM != 0) + return dev; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, + newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + dev->virtqueue[index + 1] = vq + 1; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva) +{ + struct virtio_memory_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. 
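
qva_to_vva() above is a linear scan over at most VHOST_MEMORY_MAX_NREGIONS (8) mapped regions, so nothing fancier than a loop is warranted. A standalone model of the translation, with fabricated region values:

#include <stdint.h>
#include <stdio.h>

struct region {
	uint64_t guest_user_addr;	/* QEMU's virtual address  */
	uint64_t host_user_addr;	/* our mmap of that memory */
	uint64_t size;
};

static uint64_t
qva_to_vva(const struct region *r, int n, uint64_t qva)
{
	int i;

	for (i = 0; i < n; i++, r++)
		if (qva >= r->guest_user_addr &&
		    qva < r->guest_user_addr + r->size)
			return qva - r->guest_user_addr + r->host_user_addr;
	return 0;	/* not in any region: invalid */
}

int
main(void)
{
	struct region regs[] = {
		{ 0x7f0000000000, 0x400000000000, 1 << 30 },
	};

	printf("0x%llx\n", (unsigned long long)
	       qva_to_vva(regs, 1, 0x7f0000001000));	/* 0x400000001000 */
	printf("0x%llx\n", (unsigned long long)
	       qva_to_vva(regs, 1, 0x7e0000000000));	/* 0: a miss */
	return 0;
}
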
+ */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, struct vhost_vring_addr *addr) +{ + struct vhost_virtqueue *vq; + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[addr->index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. */ + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + addr->desc_user_addr); + if (vq->desc == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find desc ring address.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, addr->index); + vq = dev->virtqueue[addr->index]; + + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + addr->avail_user_addr); + if (vq->avail == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + addr->used_user_addr); + if (vq->used == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = addr->log_guest_addr; + + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. 
+ */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + dev->virtqueue[state->index]->last_used_idx = state->num; + dev->virtqueue[state->index]->last_avail_idx = state->num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages *= 2; + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, + page_size); + + host_user_addr += page_size; + guest_phys_addr += page_size; + reg_size -= page_size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct VhostUserMemory memory = pmsg->payload.memory; + struct virtio_memory_region *reg; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + /* Remove from the data plane. 
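
add_one_guest_page() above coalesces ranges whose host-physical pages turn out to be adjacent, which keeps the guest-page table short. Here is the merge rule in isolation, in a fixed-capacity table with hypothetical names; unlike the original, this sketch checks continuity on both the guest and host side for clarity:

#include <stdint.h>
#include <stdio.h>

struct page { uint64_t gpa, hpa, size; };

static int
add_page(struct page *t, int n, uint64_t gpa, uint64_t hpa, uint64_t size)
{
	/* Merge when the new range continues the previous one. */
	if (n > 0 && t[n - 1].hpa + t[n - 1].size == hpa &&
	    t[n - 1].gpa + t[n - 1].size == gpa) {
		t[n - 1].size += size;
		return n;
	}
	t[n] = (struct page){ gpa, hpa, size };
	return n + 1;
}

int
main(void)
{
	struct page t[8];
	int n = 0;

	n = add_page(t, n, 0x0000, 0x9000, 0x1000);
	n = add_page(t, n, 0x1000, 0xa000, 0x1000);	/* merges    */
	n = add_page(t, n, 0x2000, 0xf000, 0x1000);	/* new entry */
	printf("entries: %d, first run: 0x%llx bytes\n",
	       n, (unsigned long long)t[0].size);	/* 2, 0x2000 */
	return 0;
}
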
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) + + sizeof(struct virtio_memory_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = pmsg->fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. + */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *rvq, *tvq; + uint32_t i; + + for (i = 0; i < dev->virt_qp_nb; i++) { + rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; + tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; + + if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is not ready for processing.\n"); + return 0; + } + } + + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + uint32_t cur_qp_idx; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if 
(pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + /* + * FIXME: VHOST_SET_VRING_CALL is the first per-vring message + * we get, so we do vring queue pair allocation here. + */ + cur_qp_idx = file.index / VIRTIO_QNUM; + if (cur_qp_idx + 1 > dev->virt_qp_nb) { + if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) + return; + } + + vq = dev->virtqueue[file.index]; + assert(vq != NULL); + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +/* + * In vhost-user, when we receive kick message, will test whether virtio + * device is ready for packet processing. + */ +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; + + if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + struct vhost_virtqueue *vq = dev->virtqueue[state->index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + /* Here we are safe to get the last used index */ + state->num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", state->index, state->num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. 
+ */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + int enable = (int)state->num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, state->index); + + if (notify_ops->vring_state_changed) + notify_ops->vring_state_changed(dev->vid, state->index, enable); + + dev->virtqueue[state->index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
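
read_fd_message(), declared in vhost_user.h and implemented in socket.c, wraps recvmsg() and peels the passed file descriptors out of SCM_RIGHTS ancillary data; that is how memory-region, kick and call fds ride along with vhost-user messages. A minimal round trip over a socketpair, one fd per message to keep it short:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

/* Send 'len' bytes plus one file descriptor as SCM_RIGHTS data. */
static void
send_with_fd(int sock, const char *buf, size_t len, int fd)
{
	char ctl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { (void *)buf, len };
	struct msghdr mh = { 0 };
	struct cmsghdr *cm;

	mh.msg_iov = &iov;
	mh.msg_iovlen = 1;
	mh.msg_control = ctl;
	mh.msg_controllen = sizeof(ctl);
	cm = CMSG_FIRSTHDR(&mh);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cm), &fd, sizeof(int));
	sendmsg(sock, &mh, 0);
}

/* Receive a message; returns the fd that rode along, or -1. */
static int
recv_with_fd(int sock, char *buf, size_t len)
{
	char ctl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { buf, len };
	struct msghdr mh = { 0 };
	struct cmsghdr *cm;
	int fd = -1;

	mh.msg_iov = &iov;
	mh.msg_iovlen = 1;
	mh.msg_control = ctl;
	mh.msg_controllen = sizeof(ctl);
	if (recvmsg(sock, &mh, 0) <= 0)
		return -1;
	for (cm = CMSG_FIRSTHDR(&mh); cm; cm = CMSG_NXTHDR(&mh, cm))
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cm), sizeof(int));
	return fd;
}

int
main(void)
{
	int sv[2], fd;
	char buf[16] = { 0 };

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;
	send_with_fd(sv[0], "kick", 5, STDOUT_FILENO);
	fd = recv_with_fd(sv[1], buf, sizeof(buf));
	if (fd >= 0)	/* the received fd is usable immediately */
		dprintf(fd, "got '%s' plus a working fd\n", buf);
	return 0;
}
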
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + int ret; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request]); + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg.payload.state); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg.payload.addr); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + ret = vhost_user_get_vring_base(dev, &msg.payload.state); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, 
&msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg.payload.state); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + default: + break; + + } + + return 0; +} diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h new file mode 100644 index 00000000..ba78d326 --- /dev/null +++ b/lib/librte_vhost/vhost_user.h @@ -0,0 +1,128 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include + +#include "rte_virtio_net.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/lib/librte_vhost/vhost_user/fd_man.c b/lib/librte_vhost/vhost_user/fd_man.c deleted file mode 100644 index 2d3eeb7d..00000000 --- a/lib/librte_vhost/vhost_user/fd_man.c +++ /dev/null @@ -1,299 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
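
The packed VhostUserMsg above mirrors QEMU's wire format: request, flags and payload-size words, then the payload, with VHOST_USER_HDR_SIZE taken as the offset of the payload union. A freestanding check of that arithmetic, with the payload union trimmed down to its u64 member:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t request;
	uint32_t flags;
	uint32_t size;		/* payload bytes that follow */
	union {
		uint64_t u64;
		/* state/addr/memory/log elided in this sketch */
	} payload;
} __attribute__((packed)) Msg;

#define HDR_SIZE offsetof(Msg, payload.u64)

int
main(void)
{
	/* 12-byte header: read this much first, then 'size' more. */
	printf("header = %zu bytes, u64 reply = %zu bytes\n",
	       HDR_SIZE, HDR_SIZE + sizeof(uint64_t));	/* 12, 20 */
	return 0;
}
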
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "fd_man.h" - -/** - * Returns the index in the fdset for a given fd. - * If fd is -1, it means to search for a free entry. - * @return - * index for the fd, or -1 if fd isn't in the fdset. - */ -static int -fdset_find_fd(struct fdset *pfdset, int fd) -{ - int i; - - if (pfdset == NULL) - return -1; - - for (i = 0; i < MAX_FDS && pfdset->fd[i].fd != fd; i++) - ; - - return i == MAX_FDS ? -1 : i; -} - -static int -fdset_find_free_slot(struct fdset *pfdset) -{ - return fdset_find_fd(pfdset, -1); -} - -static int -fdset_add_fd(struct fdset *pfdset, int idx, int fd, - fd_cb rcb, fd_cb wcb, void *dat) -{ - struct fdentry *pfdentry; - - if (pfdset == NULL || idx >= MAX_FDS || fd >= FD_SETSIZE) - return -1; - - pfdentry = &pfdset->fd[idx]; - pfdentry->fd = fd; - pfdentry->rcb = rcb; - pfdentry->wcb = wcb; - pfdentry->dat = dat; - - return 0; -} - -/** - * Fill the read/write fd_set with the fds in the fdset. - * @return - * the maximum fds filled in the read/write fd_set. - */ -static int -fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) -{ - struct fdentry *pfdentry; - int i, maxfds = -1; - int num = MAX_FDS; - - if (pfdset == NULL) - return -1; - - for (i = 0; i < num; i++) { - pfdentry = &pfdset->fd[i]; - if (pfdentry->fd != -1) { - int added = 0; - if (pfdentry->rcb && rfset) { - FD_SET(pfdentry->fd, rfset); - added = 1; - } - if (pfdentry->wcb && wfset) { - FD_SET(pfdentry->fd, wfset); - added = 1; - } - if (added) - maxfds = pfdentry->fd < maxfds ? - maxfds : pfdentry->fd; - } - } - return maxfds; -} - -void -fdset_init(struct fdset *pfdset) -{ - int i; - - if (pfdset == NULL) - return; - - for (i = 0; i < MAX_FDS; i++) { - pfdset->fd[i].fd = -1; - pfdset->fd[i].dat = NULL; - } - pfdset->num = 0; -} - -/** - * Register the fd in the fdset with read/write handler and context. - */ -int -fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) -{ - int i; - - if (pfdset == NULL || fd == -1) - return -1; - - pthread_mutex_lock(&pfdset->fd_mutex); - - /* Find a free slot in the list. */ - i = fdset_find_free_slot(pfdset); - if (i == -1 || fdset_add_fd(pfdset, i, fd, rcb, wcb, dat) < 0) { - pthread_mutex_unlock(&pfdset->fd_mutex); - return -2; - } - - pfdset->num++; - - pthread_mutex_unlock(&pfdset->fd_mutex); - - return 0; -} - -/** - * Unregister the fd from the fdset. - * Returns context of a given fd or NULL. 
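
The fdset being deleted here (and re-added at the top of librte_vhost as fd_man.c) is a callback table draped over select(). Its core shape, reduced to one registered fd with a read callback and none of the locking or busy flags:

#include <stdio.h>
#include <sys/select.h>
#include <unistd.h>

#define MAX_FDS 8

typedef void (*fd_cb)(int fd, void *dat);

struct fdentry {
	int fd;		/* -1 marks a free slot */
	fd_cb rcb;	/* called when fd is readable */
	void *dat;	/* opaque context for the callback */
};

static void
on_readable(int fd, void *dat)
{
	char buf[32];
	ssize_t n = read(fd, buf, sizeof(buf) - 1);

	if (n > 0) {
		buf[n] = '\0';
		printf("%s: %s", (const char *)dat, buf);
	}
}

int
main(void)
{
	struct fdentry set[MAX_FDS];
	int pipefd[2], i, maxfd = -1;
	fd_set rfds;

	for (i = 0; i < MAX_FDS; i++)
		set[i].fd = -1;
	if (pipe(pipefd) < 0)
		return 1;
	set[0] = (struct fdentry){ pipefd[0], on_readable, "demo" };

	write(pipefd[1], "hello\n", 6);	/* make slot 0 readable */

	FD_ZERO(&rfds);
	for (i = 0; i < MAX_FDS; i++)
		if (set[i].fd >= 0 && set[i].rcb) {
			FD_SET(set[i].fd, &rfds);
			if (set[i].fd > maxfd)
				maxfd = set[i].fd;
		}
	if (select(maxfd + 1, &rfds, NULL, NULL, NULL) > 0)
		for (i = 0; i < MAX_FDS; i++)
			if (set[i].fd >= 0 && FD_ISSET(set[i].fd, &rfds))
				set[i].rcb(set[i].fd, set[i].dat);
	return 0;
}
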
- */ -void * -fdset_del(struct fdset *pfdset, int fd) -{ - int i; - void *dat = NULL; - - if (pfdset == NULL || fd == -1) - return NULL; - - do { - pthread_mutex_lock(&pfdset->fd_mutex); - - i = fdset_find_fd(pfdset, fd); - if (i != -1 && pfdset->fd[i].busy == 0) { - /* busy indicates r/wcb is executing! */ - dat = pfdset->fd[i].dat; - pfdset->fd[i].fd = -1; - pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; - pfdset->fd[i].dat = NULL; - pfdset->num--; - i = -1; - } - pthread_mutex_unlock(&pfdset->fd_mutex); - } while (i != -1); - - return dat; -} - -/** - * Unregister the fd at the specified slot from the fdset. - */ -static void -fdset_del_slot(struct fdset *pfdset, int index) -{ - if (pfdset == NULL || index < 0 || index >= MAX_FDS) - return; - - pthread_mutex_lock(&pfdset->fd_mutex); - - pfdset->fd[index].fd = -1; - pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL; - pfdset->fd[index].dat = NULL; - pfdset->num--; - - pthread_mutex_unlock(&pfdset->fd_mutex); -} - -/** - * This functions runs in infinite blocking loop until there is no fd in - * pfdset. It calls corresponding r/w handler if there is event on the fd. - * - * Before the callback is called, we set the flag to busy status; If other - * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it - * will wait until the flag is reset to zero(which indicates the callback is - * finished), then it could free the context after fdset_del. - */ -void -fdset_event_dispatch(struct fdset *pfdset) -{ - fd_set rfds, wfds; - int i, maxfds; - struct fdentry *pfdentry; - int num = MAX_FDS; - fd_cb rcb, wcb; - void *dat; - int fd; - int remove1, remove2; - int ret; - - if (pfdset == NULL) - return; - - while (1) { - struct timeval tv; - tv.tv_sec = 1; - tv.tv_usec = 0; - FD_ZERO(&rfds); - FD_ZERO(&wfds); - pthread_mutex_lock(&pfdset->fd_mutex); - - maxfds = fdset_fill(&rfds, &wfds, pfdset); - - pthread_mutex_unlock(&pfdset->fd_mutex); - - /* - * When select is blocked, other threads might unregister - * listenfds from and register new listenfds into fdset. - * When select returns, the entries for listenfds in the fdset - * might have been updated. It is ok if there is unwanted call - * for new listenfds. - */ - ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv); - if (ret <= 0) - continue; - - for (i = 0; i < num; i++) { - remove1 = remove2 = 0; - pthread_mutex_lock(&pfdset->fd_mutex); - pfdentry = &pfdset->fd[i]; - fd = pfdentry->fd; - rcb = pfdentry->rcb; - wcb = pfdentry->wcb; - dat = pfdentry->dat; - pfdentry->busy = 1; - pthread_mutex_unlock(&pfdset->fd_mutex); - if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb) - rcb(fd, dat, &remove1); - if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb) - wcb(fd, dat, &remove2); - pfdentry->busy = 0; - /* - * fdset_del needs to check busy flag. - * We don't allow fdset_del to be called in callback - * directly. - */ - /* - * When we are to clean up the fd from fdset, - * because the fd is closed in the cb, - * the old fd val could be reused by when creates new - * listen fd in another thread, we couldn't call - * fd_set_del. - */ - if (remove1 || remove2) - fdset_del_slot(pfdset, i); - } - } -} diff --git a/lib/librte_vhost/vhost_user/fd_man.h b/lib/librte_vhost/vhost_user/fd_man.h deleted file mode 100644 index bd66ed1c..00000000 --- a/lib/librte_vhost/vhost_user/fd_man.h +++ /dev/null @@ -1,67 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _FD_MAN_H_ -#define _FD_MAN_H_ -#include -#include - -#define MAX_FDS 1024 - -typedef void (*fd_cb)(int fd, void *dat, int *remove); - -struct fdentry { - int fd; /* -1 indicates this entry is empty */ - fd_cb rcb; /* callback when this fd is readable. */ - fd_cb wcb; /* callback when this fd is writeable.*/ - void *dat; /* fd context */ - int busy; /* whether this entry is being used in cb. */ -}; - -struct fdset { - struct fdentry fd[MAX_FDS]; - pthread_mutex_t fd_mutex; - int num; /* current fd number of this fdset */ -}; - - -void fdset_init(struct fdset *pfdset); - -int fdset_add(struct fdset *pfdset, int fd, - fd_cb rcb, fd_cb wcb, void *dat); - -void *fdset_del(struct fdset *pfdset, int fd); - -void fdset_event_dispatch(struct fdset *pfdset); - -#endif diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c deleted file mode 100644 index b35594d9..00000000 --- a/lib/librte_vhost/vhost_user/vhost-net-user.c +++ /dev/null @@ -1,795 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
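With the full interface now in view, wiring it up takes one dispatch thread plus fdset_add() calls. The demo_* names below are invented; the static initializer mirrors the one vhost-net-user.c (next hunk) uses for its global fdset:

/* Illustrative consumer of the fd_man interface declared above. */
#include <pthread.h>

#include "fd_man.h"

static struct fdset demo_fdset = {
	.fd = { [0 ... MAX_FDS - 1] = { .fd = -1 } },
	.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
	.num = 0,
};

static void *
demo_dispatch(void *arg __attribute__((unused)))
{
	fdset_event_dispatch(&demo_fdset);	/* blocks for the process lifetime */
	return NULL;
}

static int
demo_watch(int fd, fd_cb on_readable, void *ctx)
{
	pthread_t tid;

	if (fdset_add(&demo_fdset, fd, on_readable, NULL, ctx) < 0)
		return -1;
	return pthread_create(&tid, NULL, demo_dispatch, NULL);
}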
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "fd_man.h" -#include "vhost-net-user.h" -#include "vhost-net.h" -#include "virtio-net-user.h" - -/* - * Every time rte_vhost_driver_register() is invoked, an associated - * vhost_user_socket struct will be created. - */ -struct vhost_user_socket { - char *path; - int listenfd; - int connfd; - bool is_server; - bool reconnect; -}; - -struct vhost_user_connection { - struct vhost_user_socket *vsocket; - int vid; -}; - -#define MAX_VHOST_SOCKET 1024 -struct vhost_user { - struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; - struct fdset fdset; - int vsocket_cnt; - pthread_mutex_t mutex; -}; - -#define MAX_VIRTIO_BACKLOG 128 - -static void vhost_user_server_new_connection(int fd, void *data, int *remove); -static void vhost_user_msg_handler(int fd, void *dat, int *remove); -static int vhost_user_create_client(struct vhost_user_socket *vsocket); - -static struct vhost_user vhost_user = { - .fdset = { - .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, - .fd_mutex = PTHREAD_MUTEX_INITIALIZER, - .num = 0 - }, - .vsocket_cnt = 0, - .mutex = PTHREAD_MUTEX_INITIALIZER, -}; - -static const char *vhost_message_str[VHOST_USER_MAX] = { - [VHOST_USER_NONE] = "VHOST_USER_NONE", - [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", - [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", - [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", - [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", - [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", - [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", - [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", - [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", - [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", - [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", - [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", - [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", - [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", - [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", - [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", - [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", - [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", - [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", - [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", -}; - -/* return bytes# of read on success or negative val on failure. 
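 *
 * (Framing used below: a fixed VHOST_USER_HDR_SIZE header carrying the
 * request, flags and payload size is received first, with any passed
 * file descriptors riding along as SCM_RIGHTS ancillary data on the
 * same recvmsg() call; then msg->size bytes of payload are read in a
 * second step.)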
*/ -static int -read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) -{ - struct iovec iov; - struct msghdr msgh; - size_t fdsize = fd_num * sizeof(int); - char control[CMSG_SPACE(fdsize)]; - struct cmsghdr *cmsg; - int ret; - - memset(&msgh, 0, sizeof(msgh)); - iov.iov_base = buf; - iov.iov_len = buflen; - - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - msgh.msg_control = control; - msgh.msg_controllen = sizeof(control); - - ret = recvmsg(sockfd, &msgh, 0); - if (ret <= 0) { - RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); - return ret; - } - - if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { - RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); - return -1; - } - - for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msgh, cmsg)) { - if ((cmsg->cmsg_level == SOL_SOCKET) && - (cmsg->cmsg_type == SCM_RIGHTS)) { - memcpy(fds, CMSG_DATA(cmsg), fdsize); - break; - } - } - - return ret; -} - -/* return bytes# of read on success or negative val on failure. */ -static int -read_vhost_message(int sockfd, struct VhostUserMsg *msg) -{ - int ret; - - ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, - msg->fds, VHOST_MEMORY_MAX_NREGIONS); - if (ret <= 0) - return ret; - - if (msg && msg->size) { - if (msg->size > sizeof(msg->payload)) { - RTE_LOG(ERR, VHOST_CONFIG, - "invalid msg size: %d\n", msg->size); - return -1; - } - ret = read(sockfd, &msg->payload, msg->size); - if (ret <= 0) - return ret; - if (ret != (int)msg->size) { - RTE_LOG(ERR, VHOST_CONFIG, - "read control message failed\n"); - return -1; - } - } - - return ret; -} - -static int -send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) -{ - - struct iovec iov; - struct msghdr msgh; - size_t fdsize = fd_num * sizeof(int); - char control[CMSG_SPACE(fdsize)]; - struct cmsghdr *cmsg; - int ret; - - memset(&msgh, 0, sizeof(msgh)); - iov.iov_base = buf; - iov.iov_len = buflen; - - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - - if (fds && fd_num > 0) { - msgh.msg_control = control; - msgh.msg_controllen = sizeof(control); - cmsg = CMSG_FIRSTHDR(&msgh); - cmsg->cmsg_len = CMSG_LEN(fdsize); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - memcpy(CMSG_DATA(cmsg), fds, fdsize); - } else { - msgh.msg_control = NULL; - msgh.msg_controllen = 0; - } - - do { - ret = sendmsg(sockfd, &msgh, 0); - } while (ret < 0 && errno == EINTR); - - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); - return ret; - } - - return ret; -} - -static int -send_vhost_message(int sockfd, struct VhostUserMsg *msg) -{ - int ret; - - if (!msg) - return 0; - - msg->flags &= ~VHOST_USER_VERSION_MASK; - msg->flags |= VHOST_USER_VERSION; - msg->flags |= VHOST_USER_REPLY_MASK; - - ret = send_fd_message(sockfd, (char *)msg, - VHOST_USER_HDR_SIZE + msg->size, NULL, 0); - - return ret; -} - - -static void -vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) -{ - int vid; - size_t size; - struct vhost_user_connection *conn; - int ret; - - conn = malloc(sizeof(*conn)); - if (conn == NULL) { - close(fd); - return; - } - - vid = vhost_new_device(); - if (vid == -1) { - close(fd); - free(conn); - return; - } - - size = strnlen(vsocket->path, PATH_MAX); - vhost_set_ifname(vid, vsocket->path, size); - - RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); - - vsocket->connfd = fd; - conn->vsocket = vsocket; - conn->vid = vid; - ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler, - NULL, conn); - if (ret < 0) { - vsocket->connfd = -1; - free(conn); - 
close(fd); - RTE_LOG(ERR, VHOST_CONFIG, - "failed to add fd %d into vhost server fdset\n", - fd); - } -} - -/* call back when there is new vhost-user connection from client */ -static void -vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) -{ - struct vhost_user_socket *vsocket = dat; - - fd = accept(fd, NULL, NULL); - if (fd < 0) - return; - - RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); - vhost_user_add_connection(fd, vsocket); -} - -/* callback when there is message on the connfd */ -static void -vhost_user_msg_handler(int connfd, void *dat, int *remove) -{ - int vid; - struct vhost_user_connection *conn = dat; - struct VhostUserMsg msg; - uint64_t features; - int ret; - - vid = conn->vid; - ret = read_vhost_message(connfd, &msg); - if (ret <= 0 || msg.request >= VHOST_USER_MAX) { - struct vhost_user_socket *vsocket = conn->vsocket; - - if (ret < 0) - RTE_LOG(ERR, VHOST_CONFIG, - "vhost read message failed\n"); - else if (ret == 0) - RTE_LOG(INFO, VHOST_CONFIG, - "vhost peer closed\n"); - else - RTE_LOG(ERR, VHOST_CONFIG, - "vhost read incorrect message\n"); - - vsocket->connfd = -1; - close(connfd); - *remove = 1; - free(conn); - vhost_destroy_device(vid); - - if (vsocket->reconnect) - vhost_user_create_client(vsocket); - - return; - } - - RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", - vhost_message_str[msg.request]); - switch (msg.request) { - case VHOST_USER_GET_FEATURES: - ret = vhost_get_features(vid, &features); - msg.payload.u64 = features; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(connfd, &msg); - break; - case VHOST_USER_SET_FEATURES: - features = msg.payload.u64; - vhost_set_features(vid, &features); - break; - - case VHOST_USER_GET_PROTOCOL_FEATURES: - msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(connfd, &msg); - break; - case VHOST_USER_SET_PROTOCOL_FEATURES: - user_set_protocol_features(vid, msg.payload.u64); - break; - - case VHOST_USER_SET_OWNER: - vhost_set_owner(vid); - break; - case VHOST_USER_RESET_OWNER: - vhost_reset_owner(vid); - break; - - case VHOST_USER_SET_MEM_TABLE: - user_set_mem_table(vid, &msg); - break; - - case VHOST_USER_SET_LOG_BASE: - user_set_log_base(vid, &msg); - - /* it needs a reply */ - msg.size = sizeof(msg.payload.u64); - send_vhost_message(connfd, &msg); - break; - case VHOST_USER_SET_LOG_FD: - close(msg.fds[0]); - RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); - break; - - case VHOST_USER_SET_VRING_NUM: - vhost_set_vring_num(vid, &msg.payload.state); - break; - case VHOST_USER_SET_VRING_ADDR: - vhost_set_vring_addr(vid, &msg.payload.addr); - break; - case VHOST_USER_SET_VRING_BASE: - vhost_set_vring_base(vid, &msg.payload.state); - break; - - case VHOST_USER_GET_VRING_BASE: - ret = user_get_vring_base(vid, &msg.payload.state); - msg.size = sizeof(msg.payload.state); - send_vhost_message(connfd, &msg); - break; - - case VHOST_USER_SET_VRING_KICK: - user_set_vring_kick(vid, &msg); - break; - case VHOST_USER_SET_VRING_CALL: - user_set_vring_call(vid, &msg); - break; - - case VHOST_USER_SET_VRING_ERR: - if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) - close(msg.fds[0]); - RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); - break; - - case VHOST_USER_GET_QUEUE_NUM: - msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(connfd, &msg); - break; - - case VHOST_USER_SET_VRING_ENABLE: - user_set_vring_enable(vid, &msg.payload.state); - break; - case 
VHOST_USER_SEND_RARP: - user_send_rarp(vid, &msg); - break; - - default: - break; - - } -} - -static int -create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) -{ - int fd; - - fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd < 0) - return -1; - RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", - is_server ? "server" : "client", fd); - - if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { - RTE_LOG(ERR, VHOST_CONFIG, - "vhost-user: can't set nonblocking mode for socket, fd: " - "%d (%s)\n", fd, strerror(errno)); - close(fd); - return -1; - } - - memset(un, 0, sizeof(*un)); - un->sun_family = AF_UNIX; - strncpy(un->sun_path, path, sizeof(un->sun_path)); - un->sun_path[sizeof(un->sun_path) - 1] = '\0'; - - return fd; -} - -static int -vhost_user_create_server(struct vhost_user_socket *vsocket) -{ - int fd; - int ret; - struct sockaddr_un un; - const char *path = vsocket->path; - - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to bind to %s: %s; remove it and try again\n", - path, strerror(errno)); - goto err; - } - RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); - - ret = listen(fd, MAX_VIRTIO_BACKLOG); - if (ret < 0) - goto err; - - vsocket->listenfd = fd; - ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, - NULL, vsocket); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to add listen fd %d to vhost server fdset\n", - fd); - goto err; - } - - return 0; - -err: - close(fd); - return -1; -} - -struct vhost_user_reconnect { - struct sockaddr_un un; - int fd; - struct vhost_user_socket *vsocket; - - TAILQ_ENTRY(vhost_user_reconnect) next; -}; - -TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); -struct vhost_user_reconnect_list { - struct vhost_user_reconnect_tailq_list head; - pthread_mutex_t mutex; -}; - -static struct vhost_user_reconnect_list reconn_list; -static pthread_t reconn_tid; - -static int -vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) -{ - int ret, flags; - - ret = connect(fd, un, sz); - if (ret < 0 && errno != EISCONN) - return -1; - - flags = fcntl(fd, F_GETFL, 0); - if (flags < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "can't get flags for connfd %d\n", fd); - return -2; - } - if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { - RTE_LOG(ERR, VHOST_CONFIG, - "can't disable nonblocking on fd %d\n", fd); - return -2; - } - return 0; -} - -static void * -vhost_user_client_reconnect(void *arg __rte_unused) -{ - int ret; - struct vhost_user_reconnect *reconn, *next; - - while (1) { - pthread_mutex_lock(&reconn_list.mutex); - - /* - * An equal implementation of TAILQ_FOREACH_SAFE, - * which does not exist on all platforms. 
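 *
 * For reference, the BSD macro this loop open-codes is, approximately:
 *
 *	#define TAILQ_FOREACH_SAFE(var, head, field, tvar)		\
 *		for ((var) = TAILQ_FIRST((head));			\
 *		    (var) && ((tvar) = TAILQ_NEXT((var), field), 1);	\
 *		    (var) = (tvar))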
- */ - for (reconn = TAILQ_FIRST(&reconn_list.head); - reconn != NULL; reconn = next) { - next = TAILQ_NEXT(reconn, next); - - ret = vhost_user_connect_nonblock(reconn->fd, - (struct sockaddr *)&reconn->un, - sizeof(reconn->un)); - if (ret == -2) { - close(reconn->fd); - RTE_LOG(ERR, VHOST_CONFIG, - "reconnection for fd %d failed\n", - reconn->fd); - goto remove_fd; - } - if (ret == -1) - continue; - - RTE_LOG(INFO, VHOST_CONFIG, - "%s: connected\n", reconn->vsocket->path); - vhost_user_add_connection(reconn->fd, reconn->vsocket); -remove_fd: - TAILQ_REMOVE(&reconn_list.head, reconn, next); - free(reconn); - } - - pthread_mutex_unlock(&reconn_list.mutex); - sleep(1); - } - - return NULL; -} - -static int -vhost_user_reconnect_init(void) -{ - int ret; - - pthread_mutex_init(&reconn_list.mutex, NULL); - TAILQ_INIT(&reconn_list.head); - - ret = pthread_create(&reconn_tid, NULL, - vhost_user_client_reconnect, NULL); - if (ret < 0) - RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); - - return ret; -} - -static int -vhost_user_create_client(struct vhost_user_socket *vsocket) -{ - int fd; - int ret; - struct sockaddr_un un; - const char *path = vsocket->path; - struct vhost_user_reconnect *reconn; - - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un, - sizeof(un)); - if (ret == 0) { - vhost_user_add_connection(fd, vsocket); - return 0; - } - - RTE_LOG(ERR, VHOST_CONFIG, - "failed to connect to %s: %s\n", - path, strerror(errno)); - - if (ret == -2 || !vsocket->reconnect) { - close(fd); - return -1; - } - - RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); - reconn = malloc(sizeof(*reconn)); - if (reconn == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to allocate memory for reconnect\n"); - close(fd); - return -1; - } - reconn->un = un; - reconn->fd = fd; - reconn->vsocket = vsocket; - pthread_mutex_lock(&reconn_list.mutex); - TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); - pthread_mutex_unlock(&reconn_list.mutex); - - return 0; -} - -/* - * Register a new vhost-user socket; here we could act as server - * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag - * is set. 
- */ -int -rte_vhost_driver_register(const char *path, uint64_t flags) -{ - int ret = -1; - struct vhost_user_socket *vsocket; - - if (!path) - return -1; - - pthread_mutex_lock(&vhost_user.mutex); - - if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { - RTE_LOG(ERR, VHOST_CONFIG, - "error: the number of vhost sockets reaches maximum\n"); - goto out; - } - - vsocket = malloc(sizeof(struct vhost_user_socket)); - if (!vsocket) - goto out; - memset(vsocket, 0, sizeof(struct vhost_user_socket)); - vsocket->path = strdup(path); - vsocket->connfd = -1; - - if ((flags & RTE_VHOST_USER_CLIENT) != 0) { - vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); - if (vsocket->reconnect && reconn_tid == 0) { - if (vhost_user_reconnect_init() < 0) { - free(vsocket->path); - free(vsocket); - goto out; - } - } - ret = vhost_user_create_client(vsocket); - } else { - vsocket->is_server = true; - ret = vhost_user_create_server(vsocket); - } - if (ret < 0) { - free(vsocket->path); - free(vsocket); - goto out; - } - - vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; - -out: - pthread_mutex_unlock(&vhost_user.mutex); - - return ret; -} - -static bool -vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) -{ - int found = false; - struct vhost_user_reconnect *reconn, *next; - - pthread_mutex_lock(&reconn_list.mutex); - - for (reconn = TAILQ_FIRST(&reconn_list.head); - reconn != NULL; reconn = next) { - next = TAILQ_NEXT(reconn, next); - - if (reconn->vsocket == vsocket) { - TAILQ_REMOVE(&reconn_list.head, reconn, next); - close(reconn->fd); - free(reconn); - found = true; - break; - } - } - pthread_mutex_unlock(&reconn_list.mutex); - return found; -} - -/** - * Unregister the specified vhost socket - */ -int -rte_vhost_driver_unregister(const char *path) -{ - int i; - int count; - struct vhost_user_connection *conn; - - pthread_mutex_lock(&vhost_user.mutex); - - for (i = 0; i < vhost_user.vsocket_cnt; i++) { - struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; - - if (!strcmp(vsocket->path, path)) { - if (vsocket->is_server) { - fdset_del(&vhost_user.fdset, vsocket->listenfd); - close(vsocket->listenfd); - unlink(path); - } else if (vsocket->reconnect) { - vhost_user_remove_reconnect(vsocket); - } - - conn = fdset_del(&vhost_user.fdset, vsocket->connfd); - if (conn) { - RTE_LOG(INFO, VHOST_CONFIG, - "free connfd = %d for device '%s'\n", - vsocket->connfd, path); - close(vsocket->connfd); - vhost_destroy_device(conn->vid); - free(conn); - } - - free(vsocket->path); - free(vsocket); - - count = --vhost_user.vsocket_cnt; - vhost_user.vsockets[i] = vhost_user.vsockets[count]; - vhost_user.vsockets[count] = NULL; - pthread_mutex_unlock(&vhost_user.mutex); - - return 0; - } - } - pthread_mutex_unlock(&vhost_user.mutex); - - return -1; -} - -int -rte_vhost_driver_session_start(void) -{ - fdset_event_dispatch(&vhost_user.fdset); - return 0; -} diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h deleted file mode 100644 index f5332396..00000000 --- a/lib/librte_vhost/vhost_user/vhost-net-user.h +++ /dev/null @@ -1,113 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
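For context, applications drove the three public entry points removed in this hunk (register, unregister, session start) roughly as sketched below; the demo_* names and stub callbacks are invented for illustration:

/* Rough sketch of an application bootstrapping the old socket API. */
#include <rte_virtio_net.h>

static int
demo_new_device(int vid)
{
	(void)vid;	/* attach the device to a datapath core here */
	return 0;
}

static void
demo_destroy_device(int vid)
{
	(void)vid;	/* quiesce and detach the device here */
}

static const struct virtio_net_device_ops demo_ops = {
	.new_device = demo_new_device,
	.destroy_device = demo_destroy_device,
};

static int
demo_vhost_start(const char *path)
{
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;
	rte_vhost_driver_callback_register(&demo_ops);
	/* blocks, dispatching new connections and vhost-user messages */
	return rte_vhost_driver_session_start();
}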
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _VHOST_NET_USER_H -#define _VHOST_NET_USER_H - -#include -#include - -#include "rte_virtio_net.h" - -/* refer to hw/virtio/vhost-user.c */ - -#define VHOST_MEMORY_MAX_NREGIONS 8 - -typedef enum VhostUserRequest { - VHOST_USER_NONE = 0, - VHOST_USER_GET_FEATURES = 1, - VHOST_USER_SET_FEATURES = 2, - VHOST_USER_SET_OWNER = 3, - VHOST_USER_RESET_OWNER = 4, - VHOST_USER_SET_MEM_TABLE = 5, - VHOST_USER_SET_LOG_BASE = 6, - VHOST_USER_SET_LOG_FD = 7, - VHOST_USER_SET_VRING_NUM = 8, - VHOST_USER_SET_VRING_ADDR = 9, - VHOST_USER_SET_VRING_BASE = 10, - VHOST_USER_GET_VRING_BASE = 11, - VHOST_USER_SET_VRING_KICK = 12, - VHOST_USER_SET_VRING_CALL = 13, - VHOST_USER_SET_VRING_ERR = 14, - VHOST_USER_GET_PROTOCOL_FEATURES = 15, - VHOST_USER_SET_PROTOCOL_FEATURES = 16, - VHOST_USER_GET_QUEUE_NUM = 17, - VHOST_USER_SET_VRING_ENABLE = 18, - VHOST_USER_SEND_RARP = 19, - VHOST_USER_MAX -} VhostUserRequest; - -typedef struct VhostUserMemoryRegion { - uint64_t guest_phys_addr; - uint64_t memory_size; - uint64_t userspace_addr; - uint64_t mmap_offset; -} VhostUserMemoryRegion; - -typedef struct VhostUserMemory { - uint32_t nregions; - uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; -} VhostUserMemory; - -typedef struct VhostUserLog { - uint64_t mmap_size; - uint64_t mmap_offset; -} VhostUserLog; - -typedef struct VhostUserMsg { - VhostUserRequest request; - -#define VHOST_USER_VERSION_MASK 0x3 -#define VHOST_USER_REPLY_MASK (0x1 << 2) - uint32_t flags; - uint32_t size; /* the following payload size */ - union { -#define VHOST_USER_VRING_IDX_MASK 0xff -#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) - uint64_t u64; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - VhostUserMemory memory; - VhostUserLog log; - } payload; - int fds[VHOST_MEMORY_MAX_NREGIONS]; -} __attribute((packed)) VhostUserMsg; - -#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) - -/* The version of the protocol we support */ -#define VHOST_USER_VERSION 0x1 - -/*****************************************************************************/ -#endif diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c deleted file mode 100644 index e7c43479..00000000 --- a/lib/librte_vhost/vhost_user/virtio-net-user.c +++ /dev/null @@ 
-1,470 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "virtio-net-user.h" -#include "vhost-net-user.h" -#include "vhost-net.h" - -struct orig_region_map { - int fd; - uint64_t mapped_address; - uint64_t mapped_size; - uint64_t blksz; -}; - -#define orig_region(ptr, nregions) \ - ((struct orig_region_map *)RTE_PTR_ADD((ptr), \ - sizeof(struct virtio_memory) + \ - sizeof(struct virtio_memory_regions) * (nregions))) - -static uint64_t -get_blk_size(int fd) -{ - struct stat stat; - int ret; - - ret = fstat(fd, &stat); - return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; -} - -static void -free_mem_region(struct virtio_net *dev) -{ - struct orig_region_map *region; - unsigned int idx; - - if (!dev || !dev->mem) - return; - - region = orig_region(dev->mem, dev->mem->nregions); - for (idx = 0; idx < dev->mem->nregions; idx++) { - if (region[idx].mapped_address) { - munmap((void *)(uintptr_t)region[idx].mapped_address, - region[idx].mapped_size); - close(region[idx].fd); - } - } -} - -void -vhost_backend_cleanup(struct virtio_net *dev) -{ - if (dev->mem) { - free_mem_region(dev); - free(dev->mem); - dev->mem = NULL; - } - if (dev->log_addr) { - munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); - dev->log_addr = 0; - } -} - -int -user_set_mem_table(int vid, struct VhostUserMsg *pmsg) -{ - struct VhostUserMemory memory = pmsg->payload.memory; - struct virtio_memory_regions *pregion; - uint64_t mapped_address, mapped_size; - struct virtio_net *dev; - unsigned int idx = 0; - struct orig_region_map *pregion_orig; - uint64_t alignment; - - /* unmap old memory regions one by one*/ - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* Remove from the data plane. 
*/ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); - } - - if (dev->mem) { - free_mem_region(dev); - free(dev->mem); - dev->mem = NULL; - } - - dev->mem = calloc(1, - sizeof(struct virtio_memory) + - sizeof(struct virtio_memory_regions) * memory.nregions + - sizeof(struct orig_region_map) * memory.nregions); - if (dev->mem == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to allocate memory for dev->mem\n", - dev->vid); - return -1; - } - dev->mem->nregions = memory.nregions; - - pregion_orig = orig_region(dev->mem, memory.nregions); - for (idx = 0; idx < memory.nregions; idx++) { - pregion = &dev->mem->regions[idx]; - pregion->guest_phys_address = - memory.regions[idx].guest_phys_addr; - pregion->guest_phys_address_end = - memory.regions[idx].guest_phys_addr + - memory.regions[idx].memory_size; - pregion->memory_size = - memory.regions[idx].memory_size; - pregion->userspace_address = - memory.regions[idx].userspace_addr; - - /* This is ugly */ - mapped_size = memory.regions[idx].memory_size + - memory.regions[idx].mmap_offset; - - /* mmap() without flag of MAP_ANONYMOUS, should be called - * with length argument aligned with hugepagesz at older - * longterm version Linux, like 2.6.32 and 3.2.72, or - * mmap() will fail with EINVAL. - * - * to avoid failure, make sure in caller to keep length - * aligned. - */ - alignment = get_blk_size(pmsg->fds[idx]); - if (alignment == (uint64_t)-1) { - RTE_LOG(ERR, VHOST_CONFIG, - "couldn't get hugepage size through fstat\n"); - goto err_mmap; - } - mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment); - - mapped_address = (uint64_t)(uintptr_t)mmap(NULL, - mapped_size, - PROT_READ | PROT_WRITE, MAP_SHARED, - pmsg->fds[idx], - 0); - - RTE_LOG(INFO, VHOST_CONFIG, - "mapped region %d fd:%d to:%p sz:0x%"PRIx64" " - "off:0x%"PRIx64" align:0x%"PRIx64"\n", - idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address, - mapped_size, memory.regions[idx].mmap_offset, - alignment); - - if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, - "mmap qemu guest failed.\n"); - goto err_mmap; - } - - pregion_orig[idx].mapped_address = mapped_address; - pregion_orig[idx].mapped_size = mapped_size; - pregion_orig[idx].blksz = alignment; - pregion_orig[idx].fd = pmsg->fds[idx]; - - mapped_address += memory.regions[idx].mmap_offset; - - pregion->address_offset = mapped_address - - pregion->guest_phys_address; - - if (memory.regions[idx].guest_phys_addr == 0) { - dev->mem->base_address = - memory.regions[idx].userspace_addr; - dev->mem->mapped_address = - pregion->address_offset; - } - - LOG_DEBUG(VHOST_CONFIG, - "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n", - idx, - (void *)(uintptr_t)pregion->guest_phys_address, - (void *)(uintptr_t)pregion->userspace_address, - pregion->memory_size); - } - - return 0; - -err_mmap: - while (idx--) { - munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address, - pregion_orig[idx].mapped_size); - close(pregion_orig[idx].fd); - } - free(dev->mem); - dev->mem = NULL; - return -1; -} - -static int -vq_is_ready(struct vhost_virtqueue *vq) -{ - return vq && vq->desc && - vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && - vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; -} - -static int -virtio_is_ready(struct virtio_net *dev) -{ - struct vhost_virtqueue *rvq, *tvq; - uint32_t i; - - for (i = 0; i < dev->virt_qp_nb; i++) { - rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; - tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; - - if 
(!vq_is_ready(rvq) || !vq_is_ready(tvq)) { - RTE_LOG(INFO, VHOST_CONFIG, - "virtio is not ready for processing.\n"); - return 0; - } - } - - RTE_LOG(INFO, VHOST_CONFIG, - "virtio is now ready for processing.\n"); - return 1; -} - -void -user_set_vring_call(int vid, struct VhostUserMsg *pmsg) -{ - struct vhost_vring_file file; - - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) - file.fd = VIRTIO_INVALID_EVENTFD; - else - file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, - "vring call idx:%d file:%d\n", file.index, file.fd); - vhost_set_vring_call(vid, &file); -} - - -/* - * In vhost-user, when we receive kick message, will test whether virtio - * device is ready for packet processing. - */ -void -user_set_vring_kick(int vid, struct VhostUserMsg *pmsg) -{ - struct vhost_vring_file file; - struct virtio_net *dev = get_device(vid); - - if (!dev) - return; - - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) - file.fd = VIRTIO_INVALID_EVENTFD; - else - file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, - "vring kick idx:%d file:%d\n", file.index, file.fd); - vhost_set_vring_kick(vid, &file); - - if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { - if (notify_ops->new_device(vid) == 0) - dev->flags |= VIRTIO_DEV_RUNNING; - } -} - -/* - * when virtio is stopped, qemu will send us the GET_VRING_BASE message. - */ -int -user_get_vring_base(int vid, struct vhost_vring_state *state) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return -1; - /* We have to stop the queue (virtio) if it is running. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); - } - - /* Here we are safe to get the last used index */ - vhost_get_vring_base(vid, state->index, state); - - RTE_LOG(INFO, VHOST_CONFIG, - "vring base idx:%d file:%d\n", state->index, state->num); - /* - * Based on current qemu vhost-user implementation, this message is - * sent and only sent in vhost_vring_stop. - * TODO: cleanup the vring, it isn't usable since here. - */ - if (dev->virtqueue[state->index]->kickfd >= 0) - close(dev->virtqueue[state->index]->kickfd); - - dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; - - return 0; -} - -/* - * when virtio queues are ready to work, qemu will send us to - * enable the virtio queue pair. 
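 *
 * ("Ready" is exactly what virtio_is_ready() checks above: every queue
 * pair has its descriptor ring mapped and both kickfd and callfd handed
 * over; only then does the kick handler call notify_ops->new_device().)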
- */ -int -user_set_vring_enable(int vid, struct vhost_vring_state *state) -{ - struct virtio_net *dev; - int enable = (int)state->num; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - RTE_LOG(INFO, VHOST_CONFIG, - "set queue enable: %d to qp idx: %d\n", - enable, state->index); - - if (notify_ops->vring_state_changed) - notify_ops->vring_state_changed(vid, state->index, enable); - - dev->virtqueue[state->index]->enabled = enable; - - return 0; -} - -void -user_set_protocol_features(int vid, uint64_t protocol_features) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) - return; - - dev->protocol_features = protocol_features; -} - -int -user_set_log_base(int vid, struct VhostUserMsg *msg) -{ - struct virtio_net *dev; - int fd = msg->fds[0]; - uint64_t size, off; - void *addr; - - dev = get_device(vid); - if (!dev) - return -1; - - if (fd < 0) { - RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); - return -1; - } - - if (msg->size != sizeof(VhostUserLog)) { - RTE_LOG(ERR, VHOST_CONFIG, - "invalid log base msg size: %"PRId32" != %d\n", - msg->size, (int)sizeof(VhostUserLog)); - return -1; - } - - size = msg->payload.log.mmap_size; - off = msg->payload.log.mmap_offset; - RTE_LOG(INFO, VHOST_CONFIG, - "log mmap size: %"PRId64", offset: %"PRId64"\n", - size, off); - - /* - * mmap from 0 to workaround a hugepage mmap bug: mmap will - * fail when offset is not page size aligned. - */ - addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); - return -1; - } - - /* - * Free previously mapped log memory on occasionally - * multiple VHOST_USER_SET_LOG_BASE. - */ - if (dev->log_addr) { - munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); - } - dev->log_addr = (uint64_t)(uintptr_t)addr; - dev->log_base = dev->log_addr + off; - dev->log_size = size; - - return 0; -} - -/* - * An rarp packet is constructed and broadcasted to notify switches about - * the new location of the migrated VM, so that packets from outside will - * not be lost after migration. - * - * However, we don't actually "send" a rarp packet here, instead, we set - * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. - */ -int -user_send_rarp(int vid, struct VhostUserMsg *msg) -{ - struct virtio_net *dev; - uint8_t *mac = (uint8_t *)&msg->payload.u64; - - dev = get_device(vid); - if (!dev) - return -1; - - RTE_LOG(DEBUG, VHOST_CONFIG, - ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", - mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); - memcpy(dev->mac.addr_bytes, mac, 6); - - /* - * Set the flag to inject a RARP broadcast packet at - * rte_vhost_dequeue_burst(). - * - * rte_smp_wmb() is for making sure the mac is copied - * before the flag is set. - */ - rte_smp_wmb(); - rte_atomic16_set(&dev->broadcast_rarp, 1); - - return 0; -} diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h deleted file mode 100644 index e1b967b8..00000000 --- a/lib/librte_vhost/vhost_user/virtio-net-user.h +++ /dev/null @@ -1,62 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. 
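The delicate part of user_set_mem_table() above is the mapping length: it must cover mmap_offset plus the region size and be rounded up to the hugepage size obtained via fstat(). Isolated into a sketch, with hypothetical names and error handling trimmed:

/* Per-region guest memory mapping as performed above. */
#include <stdint.h>
#include <sys/mman.h>
#include <sys/stat.h>

static void *
demo_map_region(int fd, uint64_t size, uint64_t mmap_offset)
{
	struct stat st;
	uint64_t len, align;

	if (fstat(fd, &st) < 0)
		return MAP_FAILED;
	align = st.st_blksize;	/* hugepage size for hugetlbfs-backed fds */

	/* round up, as RTE_ALIGN_CEIL does; old kernels reject lengths
	 * not aligned to the hugepage size */
	len = size + mmap_offset;
	len = (len + align - 1) & ~(align - 1);

	/* map from offset 0; the caller then adds mmap_offset to the
	 * returned address, as the code above does */
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}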
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _VIRTIO_NET_USER_H -#define _VIRTIO_NET_USER_H - -#include "vhost-net.h" -#include "vhost-net-user.h" - -#define VHOST_USER_PROTOCOL_F_MQ 0 -#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 -#define VHOST_USER_PROTOCOL_F_RARP 2 - -#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ - (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ - (1ULL << VHOST_USER_PROTOCOL_F_RARP)) - -int user_set_mem_table(int, struct VhostUserMsg *); - -void user_set_vring_call(int, struct VhostUserMsg *); - -void user_set_vring_kick(int, struct VhostUserMsg *); - -void user_set_protocol_features(int vid, uint64_t protocol_features); -int user_set_log_base(int vid, struct VhostUserMsg *); -int user_send_rarp(int vid, struct VhostUserMsg *); - -int user_get_vring_base(int, struct vhost_vring_state *); - -int user_set_vring_enable(int vid, struct vhost_vring_state *state); - -#endif diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c deleted file mode 100644 index 1785695b..00000000 --- a/lib/librte_vhost/virtio-net.c +++ /dev/null @@ -1,847 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
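The three protocol-feature bits declared in the header above combine into the offer mask returned for VHOST_USER_GET_PROTOCOL_FEATURES; the subsequent SET is only honoured when it stays within that offer, which user_set_protocol_features() enforced by ignoring anything wider. As plain arithmetic, with illustrative names:

/* Offer/acknowledge check behind the macros above. */
#include <stdint.h>

static const uint64_t demo_offer = (1ULL << 0) |	/* MQ */
				   (1ULL << 1) |	/* LOG_SHMFD */
				   (1ULL << 2);		/* RARP */

static int
demo_ack_protocol_features(uint64_t requested, uint64_t *negotiated)
{
	if (requested & ~demo_offer)
		return -1;	/* master requested an unoffered feature */
	*negotiated = requested;
	return 0;
}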
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef RTE_LIBRTE_VHOST_NUMA -#include -#endif - -#include - -#include -#include -#include -#include -#include -#include - -#include "vhost-net.h" - -#define MAX_VHOST_DEVICE 1024 -static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; - -/* device ops to add/remove device to/from data core. */ -struct virtio_net_device_ops const *notify_ops; - -#define VHOST_USER_F_PROTOCOL_FEATURES 30 - -/* Features supported by this lib. */ -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ - (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ - (1ULL << VIRTIO_NET_F_CTRL_RX) | \ - (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ - (VHOST_SUPPORTS_MQ) | \ - (1ULL << VIRTIO_F_VERSION_1) | \ - (1ULL << VHOST_F_LOG_ALL) | \ - (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ - (1ULL << VIRTIO_NET_F_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO6)) - -static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; - - -/* - * Converts QEMU virtual address to Vhost virtual address. This function is - * used to convert the ring addresses to our address space. - */ -static uint64_t -qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) -{ - struct virtio_memory_regions *region; - uint64_t vhost_va = 0; - uint32_t regionidx = 0; - - /* Find the region where the address lives. */ - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { - region = &dev->mem->regions[regionidx]; - if ((qemu_va >= region->userspace_address) && - (qemu_va <= region->userspace_address + - region->memory_size)) { - vhost_va = qemu_va + region->guest_phys_address + - region->address_offset - - region->userspace_address; - break; - } - } - return vhost_va; -} - -struct virtio_net * -get_device(int vid) -{ - struct virtio_net *dev = vhost_devices[vid]; - - if (unlikely(!dev)) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) device not found.\n", vid); - } - - return dev; -} - -static void -cleanup_vq(struct vhost_virtqueue *vq, int destroy) -{ - if ((vq->callfd >= 0) && (destroy != 0)) - close(vq->callfd); - if (vq->kickfd >= 0) - close(vq->kickfd); -} - -/* - * Unmap any memory, close any file descriptors and - * free any memory owned by a device. - */ -static void -cleanup_device(struct virtio_net *dev, int destroy) -{ - uint32_t i; - - vhost_backend_cleanup(dev); - - for (i = 0; i < dev->virt_qp_nb; i++) { - cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy); - cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy); - } -} - -/* - * Release virtqueues and device memory. 
- */ -static void -free_device(struct virtio_net *dev) -{ - uint32_t i; - - for (i = 0; i < dev->virt_qp_nb; i++) - rte_free(dev->virtqueue[i * VIRTIO_QNUM]); - - rte_free(dev); -} - -static void -init_vring_queue(struct vhost_virtqueue *vq, int qp_idx) -{ - memset(vq, 0, sizeof(struct vhost_virtqueue)); - - vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; - vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; - - /* Backends are set to -1 indicating an inactive device. */ - vq->backend = -1; - - /* always set the default vq pair to enabled */ - if (qp_idx == 0) - vq->enabled = 1; -} - -static void -init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - -static void -reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx) -{ - int callfd; - - callfd = vq->callfd; - init_vring_queue(vq, qp_idx); - vq->callfd = callfd; -} - -static void -reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - -static int -alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - struct vhost_virtqueue *virtqueue = NULL; - uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ; - uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ; - - virtqueue = rte_malloc(NULL, - sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0); - if (virtqueue == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for virt qp:%d.\n", qp_idx); - return -1; - } - - dev->virtqueue[virt_rx_q_idx] = virtqueue; - dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ; - - init_vring_queue_pair(dev, qp_idx); - - dev->virt_qp_nb += 1; - - return 0; -} - -/* - * Reset some variables in device structure, while keeping few - * others untouched, such as vid, ifname, virt_qp_nb: they - * should be same unless the device is removed. - */ -static void -reset_device(struct virtio_net *dev) -{ - uint32_t i; - - dev->features = 0; - dev->protocol_features = 0; - dev->flags = 0; - - for (i = 0; i < dev->virt_qp_nb; i++) - reset_vring_queue_pair(dev, i); -} - -/* - * Function is called from the CUSE open function. The device structure is - * initialised and a new entry is added to the device configuration linked - * list. - */ -int -vhost_new_device(void) -{ - struct virtio_net *dev; - int i; - - dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); - if (dev == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for new dev.\n"); - return -1; - } - - for (i = 0; i < MAX_VHOST_DEVICE; i++) { - if (vhost_devices[i] == NULL) - break; - } - if (i == MAX_VHOST_DEVICE) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to find a free slot for new device.\n"); - return -1; - } - - vhost_devices[i] = dev; - dev->vid = i; - - return i; -} - -/* - * Function is called from the CUSE release function. This function will - * cleanup the device and remove it from device configuration linked list. 
- */ -void -vhost_destroy_device(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return; - - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); - } - - cleanup_device(dev, 1); - free_device(dev); - - vhost_devices[vid] = NULL; -} - -void -vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) -{ - struct virtio_net *dev; - unsigned int len; - - dev = get_device(vid); - if (dev == NULL) - return; - - len = if_len > sizeof(dev->ifname) ? - sizeof(dev->ifname) : if_len; - - strncpy(dev->ifname, if_name, len); - dev->ifname[sizeof(dev->ifname) - 1] = '\0'; -} - - -/* - * Called from CUSE IOCTL: VHOST_SET_OWNER - * This function just returns success at the moment unless - * the device hasn't been initialised. - */ -int -vhost_set_owner(int vid) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_RESET_OWNER - */ -int -vhost_reset_owner(int vid) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); - } - - cleanup_device(dev, 0); - reset_device(dev); - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_GET_FEATURES - * The features that we support are requested. - */ -int -vhost_get_features(int vid, uint64_t *pu) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* Send our supported features. */ - *pu = VHOST_FEATURES; - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_SET_FEATURES - * We receive the negotiated features supported by us and the virtio device. - */ -int -vhost_set_features(int vid, uint64_t *pu) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - if (*pu & ~VHOST_FEATURES) - return -1; - - dev->features = *pu; - if (dev->features & - ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { - dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); - } else { - dev->vhost_hlen = sizeof(struct virtio_net_hdr); - } - LOG_DEBUG(VHOST_CONFIG, - "(%d) mergeable RX buffers %s, virtio 1 %s\n", - dev->vid, - (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", - (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_SET_VRING_NUM - * The virtio device sends us the size of the descriptor ring. - */ -int -vhost_set_vring_num(int vid, struct vhost_vring_state *state) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* State->index refers to the queue index. The txq is 1, rxq is 0. */ - dev->virtqueue[state->index]->size = state->num; - - return 0; -} - -/* - * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the - * same numa node as the memory of vring descriptor. - */ -#ifdef RTE_LIBRTE_VHOST_NUMA -static struct virtio_net* -numa_realloc(struct virtio_net *dev, int index) -{ - int oldnode, newnode; - struct virtio_net *old_dev; - struct vhost_virtqueue *old_vq, *vq; - int ret; - - /* - * vq is allocated on pairs, we should try to do realloc - * on first queue of one queue pair only. 
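 *
 * The node lookups below rely on get_mempolicy(MPOL_F_NODE |
 * MPOL_F_ADDR), which reports the NUMA node backing an arbitrary
 * address, e.g.:
 *
 *	int node;
 *	if (get_mempolicy(&node, NULL, 0, addr,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		; /* "addr" resides on NUMA node "node" */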
- */ - if (index % VIRTIO_QNUM != 0) - return dev; - - old_dev = dev; - vq = old_vq = dev->virtqueue[index]; - - ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, - MPOL_F_NODE | MPOL_F_ADDR); - - /* check if we need to reallocate vq */ - ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret) { - RTE_LOG(ERR, VHOST_CONFIG, - "Unable to get vq numa information.\n"); - return dev; - } - if (oldnode != newnode) { - RTE_LOG(INFO, VHOST_CONFIG, - "reallocate vq from %d to %d node\n", oldnode, newnode); - vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, - newnode); - if (!vq) - return dev; - - memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); - rte_free(old_vq); - } - - /* check if we need to reallocate dev */ - ret = get_mempolicy(&oldnode, NULL, 0, old_dev, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret) { - RTE_LOG(ERR, VHOST_CONFIG, - "Unable to get dev numa information.\n"); - goto out; - } - if (oldnode != newnode) { - RTE_LOG(INFO, VHOST_CONFIG, - "reallocate dev from %d to %d node\n", - oldnode, newnode); - dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); - if (!dev) { - dev = old_dev; - goto out; - } - - memcpy(dev, old_dev, sizeof(*dev)); - rte_free(old_dev); - } - -out: - dev->virtqueue[index] = vq; - dev->virtqueue[index + 1] = vq + 1; - vhost_devices[dev->vid] = dev; - - return dev; -} -#else -static struct virtio_net* -numa_realloc(struct virtio_net *dev, int index __rte_unused) -{ - return dev; -} -#endif - -/* - * Called from CUSE IOCTL: VHOST_SET_VRING_ADDR - * The virtio device sends us the desc, used and avail ring addresses. - * This function then converts these to our address space. - */ -int -vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if ((dev == NULL) || (dev->mem == NULL)) - return -1; - - /* addr->index refers to the queue index. The txq 1, rxq is 0. */ - vq = dev->virtqueue[addr->index]; - - /* The addresses are converted from QEMU virtual to Vhost virtual. 
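 *
 * Per region, the translation qva_to_vva() applies above reduces to
 * rebasing the address between two mappings of the same guest memory:
 *
 *	vhost_va = qemu_va - region->userspace_address
 *		 + region->guest_phys_address + region->address_offset;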
*/ - vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, - addr->desc_user_addr); - if (vq->desc == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find desc ring address.\n", - dev->vid); - return -1; - } - - dev = numa_realloc(dev, addr->index); - vq = dev->virtqueue[addr->index]; - - vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, - addr->avail_user_addr); - if (vq->avail == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find avail ring address.\n", - dev->vid); - return -1; - } - - vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, - addr->used_user_addr); - if (vq->used == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find used ring address.\n", - dev->vid); - return -1; - } - - if (vq->last_used_idx != vq->used->idx) { - RTE_LOG(WARNING, VHOST_CONFIG, - "last_used_idx (%u) and vq->used->idx (%u) mismatches; " - "some packets maybe resent for Tx and dropped for Rx\n", - vq->last_used_idx, vq->used->idx); - vq->last_used_idx = vq->used->idx; - } - - vq->log_guest_addr = addr->log_guest_addr; - - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", - dev->vid, vq->desc); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", - dev->vid, vq->avail); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", - dev->vid, vq->used); - LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", - dev->vid, vq->log_guest_addr); - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_SET_VRING_BASE - * The virtio device sends us the available ring last used index. - */ -int -vhost_set_vring_base(int vid, struct vhost_vring_state *state) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* State->index refers to the queue index. The txq is 1, rxq is 0. */ - dev->virtqueue[state->index]->last_used_idx = state->num; - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_GET_VRING_BASE - * We send the virtio device our available ring last used index. - */ -int -vhost_get_vring_base(int vid, uint32_t index, - struct vhost_vring_state *state) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - state->index = index; - /* State->index refers to the queue index. The txq is 1, rxq is 0. */ - state->num = dev->virtqueue[state->index]->last_used_idx; - - return 0; -} - - -/* - * Called from CUSE IOCTL: VHOST_SET_VRING_CALL - * The virtio device sends an eventfd to interrupt the guest. This fd gets - * copied into our process space. - */ -int -vhost_set_vring_call(int vid, struct vhost_vring_file *file) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - uint32_t cur_qp_idx = file->index / VIRTIO_QNUM; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* - * FIXME: VHOST_SET_VRING_CALL is the first per-vring message - * we get, so we do vring queue pair allocation here. - */ - if (cur_qp_idx + 1 > dev->virt_qp_nb) { - if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) - return -1; - } - - /* file->index refers to the queue index. The txq is 1, rxq is 0. */ - vq = dev->virtqueue[file->index]; - assert(vq != NULL); - - if (vq->callfd >= 0) - close(vq->callfd); - - vq->callfd = file->fd; - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_SET_VRING_KICK - * The virtio device sends an eventfd that it can use to notify us. - * This fd gets copied into our process space. 
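 *
 * (Signalling on such an eventfd is plain 8-byte I/O: the notifying
 * side writes a 64-bit counter increment, write(fd, &(uint64_t){1}, 8),
 * and the listening side drains it with a matching 8-byte read.)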
- */ -int -vhost_set_vring_kick(int vid, struct vhost_vring_file *file) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* file->index refers to the queue index. The txq is 1, rxq is 0. */ - vq = dev->virtqueue[file->index]; - - if (vq->kickfd >= 0) - close(vq->kickfd); - - vq->kickfd = file->fd; - - return 0; -} - -/* - * Called from CUSE IOCTL: VHOST_NET_SET_BACKEND - * To complete device initialisation when the virtio driver is loaded, - * we are provided with a valid fd for a tap device (not used by us). - * If this happens then we can add the device to a data core. - * When the virtio driver is removed we get fd=-1. - * At that point we remove the device from the data core. - * The device will still exist in the device configuration linked list. - */ -int -vhost_set_backend(int vid, struct vhost_vring_file *file) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - /* file->index refers to the queue index. The txq is 1, rxq is 0. */ - dev->virtqueue[file->index]->backend = file->fd; - - /* - * If the device isn't already running and both backend fds are set, - * we add the device. - */ - if (!(dev->flags & VIRTIO_DEV_RUNNING)) { - if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED && - dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) { - if (notify_ops->new_device(vid) < 0) - return -1; - dev->flags |= VIRTIO_DEV_RUNNING; - } - } else if (file->fd == VIRTIO_DEV_STOPPED) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); - } - - return 0; -} - -int -rte_vhost_get_numa_node(int vid) -{ -#ifdef RTE_LIBRTE_VHOST_NUMA - struct virtio_net *dev = get_device(vid); - int numa_node; - int ret; - - if (dev == NULL) - return -1; - - ret = get_mempolicy(&numa_node, NULL, 0, dev, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to query numa node: %d\n", vid, ret); - return -1; - } - - return numa_node; -#else - RTE_SET_USED(vid); - return -1; -#endif -} - -uint32_t -rte_vhost_get_queue_num(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return 0; - - return dev->virt_qp_nb; -} - -int -rte_vhost_get_ifname(int vid, char *buf, size_t len) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return -1; - - len = RTE_MIN(len, sizeof(dev->ifname)); - - strncpy(buf, dev->ifname, len); - buf[len - 1] = '\0'; - - return 0; -} - -uint16_t -rte_vhost_avail_entries(int vid, uint16_t queue_id) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (!dev) - return 0; - - vq = dev->virtqueue[queue_id]; - if (!vq->enabled) - return 0; - - return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; -} - -int -rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return -1; - - if (enable) { - RTE_LOG(ERR, VHOST_CONFIG, - "guest notification isn't supported.\n"); - return -1; - } - - dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; - return 0; -} - -uint64_t rte_vhost_feature_get(void) -{ - return VHOST_FEATURES; -} - -int rte_vhost_feature_disable(uint64_t feature_mask) -{ - VHOST_FEATURES = VHOST_FEATURES & ~feature_mask; - return 0; -} - -int rte_vhost_feature_enable(uint64_t feature_mask) -{ - if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) { - VHOST_FEATURES = VHOST_FEATURES | feature_mask; - return 0; - } 
- return -1; -} - -/* - * Register ops so that we can add/remove device to data core. - */ -int -rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops) -{ - notify_ops = ops; - - return 0; -} diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c new file mode 100644 index 00000000..595f67c4 --- /dev/null +++ b/lib/librte_vhost/virtio_net.c @@ -0,0 +1,1182 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_virtio_net.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+#include <rte_arp.h>
+
+#include "vhost.h"
+
+#define MAX_PKT_BURST 32
+#define VHOST_LOG_PAGE	4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+	log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+	uint64_t page;
+
+	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+		   !dev->log_base || !len))
+		return;
+
+	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+		return;
+
+	/* To make sure guest memory updates are committed before logging */
+	rte_smp_wmb();
+
+	page = addr / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < addr + len) {
+		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+		page += 1;
+	}
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		     uint64_t offset, uint64_t len)
+{
+	vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+{
+	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+}
+
+static inline void __attribute__((always_inline))
+do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+			  uint16_t to, uint16_t from, uint16_t size)
+{
+	rte_memcpy(&vq->used->ring[to],
+			&vq->shadow_used_ring[from],
+			size * sizeof(struct vring_used_elem));
+	vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[to]),
+			size * sizeof(struct vring_used_elem));
+}
+
+static inline void __attribute__((always_inline))
+flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+	if (used_idx + vq->shadow_used_idx <= vq->size) {
+		do_flush_shadow_used_ring(dev, vq, used_idx, 0,
+					  vq->shadow_used_idx);
+	} else {
+		uint16_t size;
+
+		/* update used ring interval [used_idx, vq->size] */
+		size = vq->size - used_idx;
+		do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
+
+		/* update the left half used ring interval [0, left_size] */
+		do_flush_shadow_used_ring(dev, vq, 0, size,
+					  vq->shadow_used_idx - size);
+	}
+	vq->last_used_idx += vq->shadow_used_idx;
+
+	rte_smp_wmb();
+
+	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+		sizeof(vq->used->idx));
+}
+
+static inline void __attribute__((always_inline))
+update_shadow_used_ring(struct vhost_virtqueue *vq,
+			 uint16_t desc_idx, uint16_t len)
+{
+	uint16_t i = vq->shadow_used_idx++;
+
+	vq->shadow_used_ring[i].id  = desc_idx;
+	vq->shadow_used_ring[i].len = len;
+}
+
+static void
+virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
+{
+	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
+		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
+
+		switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
+		case PKT_TX_TCP_CKSUM:
+			net_hdr->csum_offset = (offsetof(struct tcp_hdr,
+						cksum));
+			break;
+		case PKT_TX_UDP_CKSUM:
+			net_hdr->csum_offset = (offsetof(struct udp_hdr,
+						dgram_cksum));
+			break;
+		case PKT_TX_SCTP_CKSUM:
+			net_hdr->csum_offset = (offsetof(struct sctp_hdr,
+						cksum));
+			break;
+		}
+	}
+
+	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
+		if (m_buf->ol_flags & PKT_TX_IPV4)
+			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+		else
+			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+		net_hdr->gso_size = m_buf->tso_segsz;
+		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+			    + m_buf->l4_len;
+	}
+}
+
+static inline void
+copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
+		    struct virtio_net_hdr_mrg_rxbuf hdr)
+{
+	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
+		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
+	else
+		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+}
+
+static inline int __attribute__((always_inline))
+copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
+		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
+{
+	uint32_t desc_avail, desc_offset;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len;
+	struct vring_desc *desc;
+	uint64_t desc_addr;
+	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+
+	desc = &descs[desc_idx];
+	desc_addr = gpa_to_vva(dev, desc->addr);
+	/*
+	 * The check on 'desc_addr' is placed outside the 'unlikely' macro
+	 * to avoid a performance issue with some versions of gcc (4.8.4 and
+	 * 5.3.0) which otherwise store the offset on the stack instead of
+	 * in a register.
+	 */
+	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
+		return -1;
+
+	rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+	virtio_enqueue_offload(m, &virtio_hdr.hdr);
+	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+	desc_offset = dev->vhost_hlen;
+	desc_avail  = desc->len - dev->vhost_hlen;
+
+	mbuf_avail  = rte_pktmbuf_data_len(m);
+	mbuf_offset = 0;
+	while (mbuf_avail != 0 || m->next != NULL) {
+		/* done with current mbuf, fetch next */
+		if (mbuf_avail == 0) {
+			m = m->next;
+
+			mbuf_offset = 0;
+			mbuf_avail  = rte_pktmbuf_data_len(m);
+		}
+
+		/* done with current desc buf, fetch next */
+		if (desc_avail == 0) {
+			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
+				/* Not enough room in the vring buffer */
+				return -1;
+			}
+			if (unlikely(desc->next >= size))
+				return -1;
+
+			desc = &descs[desc->next];
+			desc_addr = gpa_to_vva(dev, desc->addr);
+			if (unlikely(!desc_addr))
+				return -1;
+
+			desc_offset = 0;
+			desc_avail  = desc->len;
+		}
+
+		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+			cpy_len);
+		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+			     cpy_len, 0);
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		desc_avail  -= cpy_len;
+		desc_offset += cpy_len;
+	}
+
+	return 0;
+}
+
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue. This function works when the mbuf is scattered, but
+ * it doesn't support the mergeable feature.
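+ *
+ * Note that packet data is copied into the guest's own buffers here, so the
+ * caller keeps ownership of the mbufs and is still responsible for freeing
+ * them after the call returns.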
+ */ +static inline uint32_t __attribute__((always_inline)) +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint16_t avail_idx, free_entries, start_idx; + uint16_t desc_indexes[MAX_PKT_BURST]; + struct vring_desc *descs; + uint16_t used_idx; + uint32_t i, sz; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + free_entries = avail_idx - start_idx; + count = RTE_MIN(count, free_entries); + count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); + if (count == 0) + return 0; + + LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", + dev->vid, start_idx, start_idx + count); + + /* Retrieve all of the desc indexes first to avoid caching issues. */ + rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); + for (i = 0; i < count; i++) { + used_idx = (start_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[used_idx]; + vq->used->ring[used_idx].id = desc_indexes[i]; + vq->used->ring[used_idx].len = pkts[i]->pkt_len + + dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + rte_prefetch0(&vq->desc[desc_indexes[0]]); + for (i = 0; i < count; i++) { + uint16_t desc_idx = desc_indexes[i]; + int err; + + if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, + vq->desc[desc_idx].addr); + if (unlikely(!descs)) { + count = i; + break; + } + + desc_idx = 0; + sz = vq->desc[desc_idx].len / sizeof(*descs); + } else { + descs = vq->desc; + sz = vq->size; + } + + err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz); + if (unlikely(err)) { + used_idx = (start_idx + i) & (vq->size - 1); + vq->used->ring[used_idx].len = dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + if (i + 1 < count) + rte_prefetch0(&vq->desc[desc_indexes[i+1]]); + } + + rte_smp_wmb(); + + *(volatile uint16_t *)&vq->used->idx += count; + vq->last_used_idx += count; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. 
*/ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + return count; +} + +static inline int __attribute__((always_inline)) +fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint32_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *desc_chain_head, + uint16_t *desc_chain_len) +{ + uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; + uint32_t vec_id = *vec_idx; + uint32_t len = 0; + struct vring_desc *descs = vq->desc; + + *desc_chain_head = idx; + + if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t) + gpa_to_vva(dev, vq->desc[idx].addr); + if (unlikely(!descs)) + return -1; + + idx = 0; + } + + while (1) { + if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) + return -1; + + len += descs[idx].len; + buf_vec[vec_id].buf_addr = descs[idx].addr; + buf_vec[vec_id].buf_len = descs[idx].len; + buf_vec[vec_id].desc_idx = idx; + vec_id++; + + if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + idx = descs[idx].next; + } + + *desc_chain_len = len; + *vec_idx = vec_id; + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *num_buffers, uint16_t avail_head) +{ + uint16_t cur_idx; + uint32_t vec_idx = 0; + uint16_t tries = 0; + + uint16_t head_idx = 0; + uint16_t len = 0; + + *num_buffers = 0; + cur_idx = vq->last_avail_idx; + + while (size > 0) { + if (unlikely(cur_idx == avail_head)) + return -1; + + if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, + &head_idx, &len) < 0)) + return -1; + len = RTE_MIN(len, size); + update_shadow_used_ring(vq, head_idx, len); + size -= len; + + cur_idx++; + tries++; + *num_buffers += 1; + + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. + */ + if (unlikely(tries >= vq->size)) + return -1; + } + + return 0; +} + +static inline int __attribute__((always_inline)) +copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, + struct buf_vector *buf_vec, uint16_t num_buffers) +{ + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; + uint32_t vec_idx = 0; + uint64_t desc_addr; + uint32_t mbuf_offset, mbuf_avail; + uint32_t desc_offset, desc_avail; + uint32_t cpy_len; + uint64_t hdr_addr, hdr_phys_addr; + struct rte_mbuf *hdr_mbuf; + + if (unlikely(m == NULL)) + return -1; + + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); + if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) + return -1; + + hdr_mbuf = m; + hdr_addr = desc_addr; + hdr_phys_addr = buf_vec[vec_idx].buf_addr; + rte_prefetch0((void *)(uintptr_t)hdr_addr); + + virtio_hdr.num_buffers = num_buffers; + LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + dev->vid, num_buffers); + + desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current desc buf, get the next one */ + if (desc_avail == 0) { + vec_idx++; + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); + if (unlikely(!desc_addr)) + return -1; + + /* Prefetch buffer address. 
*/ + rte_prefetch0((void *)(uintptr_t)desc_addr); + desc_offset = 0; + desc_avail = buf_vec[vec_idx].buf_len; + } + + /* done with current mbuf, get the next one */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + if (hdr_addr) { + virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr); + copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr); + vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)hdr_addr, + dev->vhost_hlen, 0); + + hdr_addr = 0; + } + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + } + + return 0; +} + +static inline uint32_t __attribute__((always_inline)) +virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t avail_head; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + return 0; + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + + vq->shadow_used_idx = 0; + avail_head = *((volatile uint16_t *)&vq->avail->idx); + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + + if (unlikely(reserve_avail_buf_mergeable(dev, vq, + pkt_len, buf_vec, &num_buffers, + avail_head) < 0)) { + LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } + + LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); + + if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx], + buf_vec, num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } + + vq->last_avail_idx += num_buffers; + } + + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring(dev, vq); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. 
*/ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + } + + return pkt_idx; +} + +uint16_t +rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return 0; + + if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) + return virtio_dev_merge_rx(dev, queue_id, pkts, count); + else + return virtio_dev_rx(dev, queue_id, pkts, count); +} + +static inline bool +virtio_net_with_host_offload(struct virtio_net *dev) +{ + if (dev->features & + (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_ECN | + VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | + VIRTIO_NET_F_HOST_UFO)) + return true; + + return false; +} + +static void +parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + void *l3_hdr = NULL; + struct ether_hdr *eth_hdr; + uint16_t ethertype; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + m->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + m->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + m->l2_len; + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = (struct ipv4_hdr *)l3_hdr; + *l4_proto = ipv4_hdr->next_proto_id; + m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV4; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = (struct ipv6_hdr *)l3_hdr; + *l4_proto = ipv6_hdr->proto; + m->l3_len = sizeof(struct ipv6_hdr); + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV6; + break; + default: + m->l3_len = 0; + *l4_proto = 0; + break; + } +} + +static inline void __attribute__((always_inline)) +vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) +{ + uint16_t l4_proto = 0; + void *l4_hdr = NULL; + struct tcp_hdr *tcp_hdr = NULL; + + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) + return; + + parse_ethernet(m, &l4_proto, &l4_hdr); + if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (hdr->csum_start == (m->l2_len + m->l3_len)) { + switch (hdr->csum_offset) { + case (offsetof(struct tcp_hdr, cksum)): + if (l4_proto == IPPROTO_TCP) + m->ol_flags |= PKT_TX_TCP_CKSUM; + break; + case (offsetof(struct udp_hdr, dgram_cksum)): + if (l4_proto == IPPROTO_UDP) + m->ol_flags |= PKT_TX_UDP_CKSUM; + break; + case (offsetof(struct sctp_hdr, cksum)): + if (l4_proto == IPPROTO_SCTP) + m->ol_flags |= PKT_TX_SCTP_CKSUM; + break; + default: + break; + } + } + } + + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + tcp_hdr = (struct tcp_hdr *)l4_hdr; + m->ol_flags |= PKT_TX_TCP_SEG; + m->tso_segsz = hdr->gso_size; + m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; + break; + default: + RTE_LOG(WARNING, VHOST_DATA, + "unsupported gso type %u.\n", hdr->gso_type); + break; + } + } +} + +#define RARP_PKT_SIZE 64 + +static int +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac) +{ + struct ether_hdr *eth_hdr; + struct arp_hdr *rarp; + + if (rarp_mbuf->buf_len < 64) { + RTE_LOG(WARNING, VHOST_DATA, + "failed to make RARP; mbuf size too small %u (< %d)\n", + rarp_mbuf->buf_len, RARP_PKT_SIZE); + 
return -1;
+	}
+
+	/* Ethernet header. */
+	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+	ether_addr_copy(mac, &eth_hdr->s_addr);
+	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+	/* RARP header. */
+	rarp = (struct arp_hdr *)(eth_hdr + 1);
+	rarp->arp_hrd = htons(ARP_HRD_ETHER);
+	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+	rarp->arp_hln = ETHER_ADDR_LEN;
+	rarp->arp_pln = 4;
+	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
+
+	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+	memset(&rarp->arp_data.arp_sip, 0x00, 4);
+	memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+	rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
+
+	return 0;
+}
+
+static inline void __attribute__((always_inline))
+put_zmbuf(struct zcopy_mbuf *zmbuf)
+{
+	zmbuf->in_use = 0;
+}
+
+static inline int __attribute__((always_inline))
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
+		  struct rte_mempool *mbuf_pool)
+{
+	struct vring_desc *desc;
+	uint64_t desc_addr;
+	uint32_t desc_avail, desc_offset;
+	uint32_t mbuf_avail, mbuf_offset;
+	uint32_t cpy_len;
+	struct rte_mbuf *cur = m, *prev = m;
+	struct virtio_net_hdr *hdr = NULL;
+	/* A counter to guard against a looped desc chain */
+	uint32_t nr_desc = 1;
+
+	desc = &descs[desc_idx];
+	if (unlikely((desc->len < dev->vhost_hlen)) ||
+			(desc->flags & VRING_DESC_F_INDIRECT))
+		return -1;
+
+	desc_addr = gpa_to_vva(dev, desc->addr);
+	if (unlikely(!desc_addr))
+		return -1;
+
+	if (virtio_net_with_host_offload(dev)) {
+		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+		rte_prefetch0(hdr);
+	}
+
+	/*
+	 * A virtio driver normally uses at least 2 desc buffers
+	 * for Tx: the first for storing the header, and the others
+	 * for storing the data.
+	 */
+	if (likely((desc->len == dev->vhost_hlen) &&
+		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+		desc = &descs[desc->next];
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+			return -1;
+
+		desc_addr = gpa_to_vva(dev, desc->addr);
+		if (unlikely(!desc_addr))
+			return -1;
+
+		desc_offset = 0;
+		desc_avail = desc->len;
+		nr_desc   += 1;
+	} else {
+		desc_avail  = desc->len - dev->vhost_hlen;
+		desc_offset = dev->vhost_hlen;
+	}
+
+	rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
+
+	PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
+
+	mbuf_offset = 0;
+	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+	while (1) {
+		uint64_t hpa;
+
+		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+
+		/*
+		 * A desc buf might span two host physical pages that are
+		 * not contiguous. In such a case (gpa_to_hpa returns 0), the
+		 * data will be copied even though zero copy is enabled.
+		 */
+		if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
+					desc->addr + desc_offset, cpy_len)))) {
+			cur->data_len = cpy_len;
+			cur->data_off = 0;
+			cur->buf_addr = (void *)(uintptr_t)desc_addr;
+			cur->buf_physaddr = hpa;
+
+			/*
+			 * In zero copy mode, one mbuf can only reference data
+			 * for one desc buff, or part of one.
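+			 * Capping mbuf_avail to cpy_len below therefore ends
+			 * the current mbuf at this chunk, so the next loop
+			 * iteration allocates a fresh mbuf for the next one.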
+			 */
+			mbuf_avail = cpy_len;
+		} else {
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+							   mbuf_offset),
+				(void *)((uintptr_t)(desc_addr + desc_offset)),
+				cpy_len);
+		}
+
+		mbuf_avail  -= cpy_len;
+		mbuf_offset += cpy_len;
+		desc_avail  -= cpy_len;
+		desc_offset += cpy_len;
+
+		/* This desc reaches its end, get the next one */
+		if (desc_avail == 0) {
+			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
+				break;
+
+			if (unlikely(desc->next >= max_desc ||
+				     ++nr_desc > max_desc))
+				return -1;
+			desc = &descs[desc->next];
+			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+				return -1;
+
+			desc_addr = gpa_to_vva(dev, desc->addr);
+			if (unlikely(!desc_addr))
+				return -1;
+
+			rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+			desc_offset = 0;
+			desc_avail = desc->len;
+
+			PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+		}
+
+		/*
+		 * This mbuf reaches its end, get a new one
+		 * to hold more data.
+		 */
+		if (mbuf_avail == 0) {
+			cur = rte_pktmbuf_alloc(mbuf_pool);
+			if (unlikely(cur == NULL)) {
+				RTE_LOG(ERR, VHOST_DATA, "Failed to "
+					"allocate memory for mbuf.\n");
+				return -1;
+			}
+
+			prev->next = cur;
+			prev->data_len = mbuf_offset;
+			m->nb_segs += 1;
+			m->pkt_len += mbuf_offset;
+			prev = cur;
+
+			mbuf_offset = 0;
+			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+		}
+	}
+
+	prev->data_len = mbuf_offset;
+	m->pkt_len    += mbuf_offset;
+
+	if (hdr)
+		vhost_dequeue_offload(hdr, m);
+
+	return 0;
+}
+
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t used_idx, uint32_t desc_idx)
+{
+	vq->used->ring[used_idx].id  = desc_idx;
+	vq->used->ring[used_idx].len = 0;
+	vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t count)
+{
+	if (unlikely(count == 0))
+		return;
+
+	rte_smp_wmb();
+	rte_smp_rmb();
+
+	vq->used->idx += count;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
+
+	/* Kick guest if required. */
+	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+			&& (vq->callfd >= 0))
+		eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
+static inline struct zcopy_mbuf *__attribute__((always_inline))
+get_zmbuf(struct vhost_virtqueue *vq)
+{
+	uint16_t i;
+	uint16_t last;
+	int tries = 0;
+
+	/* search [last_zmbuf_idx, zmbuf_size) */
+	i = vq->last_zmbuf_idx;
+	last = vq->zmbuf_size;
+
+again:
+	for (; i < last; i++) {
+		if (vq->zmbufs[i].in_use == 0) {
+			vq->last_zmbuf_idx = i + 1;
+			vq->zmbufs[i].in_use = 1;
+			return &vq->zmbufs[i];
+		}
+	}
+
+	tries++;
+	if (tries == 1) {
+		/* search [0, last_zmbuf_idx) */
+		i = 0;
+		last = vq->last_zmbuf_idx;
+		goto again;
+	}
+
+	return NULL;
+}
+
+static inline bool __attribute__((always_inline))
+mbuf_is_consumed(struct rte_mbuf *m)
+{
+	while (m) {
+		if (rte_mbuf_refcnt_read(m) > 1)
+			return false;
+		m = m->next;
+	}
+
+	return true;
+}
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+	struct virtio_net *dev;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct vhost_virtqueue *vq;
+	uint32_t desc_indexes[MAX_PKT_BURST];
+	uint32_t used_idx;
+	uint32_t i = 0;
+	uint16_t free_entries;
+	uint16_t avail_idx;
+
+	dev = get_device(vid);
+	if (!dev)
+		return 0;
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+			dev->vid, __func__, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(vq->enabled == 0))
+		return 0;
+
+	if (unlikely(dev->dequeue_zero_copy)) {
+		struct zcopy_mbuf *zmbuf, *next;
+		int nr_updated = 0;
+
+		for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+		     zmbuf != NULL; zmbuf = next) {
+			next = TAILQ_NEXT(zmbuf, next);
+
+			if (mbuf_is_consumed(zmbuf->mbuf)) {
+				used_idx = vq->last_used_idx++ & (vq->size - 1);
+				update_used_ring(dev, vq, used_idx,
+						 zmbuf->desc_idx);
+				nr_updated += 1;
+
+				TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+				rte_pktmbuf_free(zmbuf->mbuf);
+				put_zmbuf(zmbuf);
+				vq->nr_zmbuf -= 1;
+			}
+		}
+
+		update_used_idx(dev, vq, nr_updated);
+	}
+
+	/*
+	 * Construct a RARP broadcast packet, and inject it into the "pkts"
+	 * array, to make it look like the guest actually sent such a packet.
+	 *
+	 * Check user_send_rarp() for more information.
+	 */
+	if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
+					 &dev->broadcast_rarp.cnt, 1, 0))) {
+		rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+		if (rarp_mbuf == NULL) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+			return 0;
+		}
+
+		if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
+			rte_pktmbuf_free(rarp_mbuf);
+			rarp_mbuf = NULL;
+		} else {
+			count -= 1;
+		}
+	}
+
+	free_entries = *((volatile uint16_t *)&vq->avail->idx) -
+			vq->last_avail_idx;
+	if (free_entries == 0)
+		goto out;
+
+	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+	/* Prefetch available and used ring */
+	avail_idx = vq->last_avail_idx & (vq->size - 1);
+	used_idx  = vq->last_used_idx  & (vq->size - 1);
+	rte_prefetch0(&vq->avail->ring[avail_idx]);
+	rte_prefetch0(&vq->used->ring[used_idx]);
+
+	count = RTE_MIN(count, MAX_PKT_BURST);
+	count = RTE_MIN(count, free_entries);
+	LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+			dev->vid, count);
+
+	/* Retrieve all of the head indexes first to avoid caching issues. */
+	for (i = 0; i < count; i++) {
+		avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
+		used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
+		desc_indexes[i] = vq->avail->ring[avail_idx];
+
+		if (likely(dev->dequeue_zero_copy == 0))
+			update_used_ring(dev, vq, used_idx, desc_indexes[i]);
+	}
+
+	/* Prefetch descriptor index. */
+	rte_prefetch0(&vq->desc[desc_indexes[0]]);
+	for (i = 0; i < count; i++) {
+		struct vring_desc *desc;
+		uint16_t sz, idx;
+		int err;
+
+		if (likely(i + 1 < count))
+			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
+
+		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+			desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
+					vq->desc[desc_indexes[i]].addr);
+			if (unlikely(!desc))
+				break;
+
+			rte_prefetch0(desc);
+			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+			idx = 0;
+		} else {
+			desc = vq->desc;
+			sz = vq->size;
+			idx = desc_indexes[i];
+		}
+
+		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+		if (unlikely(pkts[i] == NULL)) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+			break;
+		}
+
+		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
+		if (unlikely(err)) {
+			rte_pktmbuf_free(pkts[i]);
+			break;
+		}
+
+		if (unlikely(dev->dequeue_zero_copy)) {
+			struct zcopy_mbuf *zmbuf;
+
+			zmbuf = get_zmbuf(vq);
+			if (!zmbuf) {
+				rte_pktmbuf_free(pkts[i]);
+				break;
+			}
+			zmbuf->mbuf = pkts[i];
+			zmbuf->desc_idx = desc_indexes[i];
+
+			/*
+			 * Pin the mbuf with an extra reference; we check
+			 * later whether it has been freed (meaning we were
+			 * the last user), in which case the used ring can
+			 * be updated safely.
+			 */
+			rte_mbuf_refcnt_update(pkts[i], 1);
+
+			vq->nr_zmbuf += 1;
+			TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+		}
+	}
+	vq->last_avail_idx += i;
+
+	if (likely(dev->dequeue_zero_copy == 0)) {
+		vq->last_used_idx += i;
+		update_used_idx(dev, vq, i);
+	}
+
+out:
+	if (unlikely(rarp_mbuf != NULL)) {
+		/*
+		 * Inject it at the head of the "pkts" array, so that the
+		 * switch's MAC learning table gets updated first.
+		 */
+		memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
+		pkts[0] = rarp_mbuf;
+		i += 1;
+	}
+
+	return i;
+}
--
cgit 1.2.3-korg
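
The two public burst calls added above pair naturally into a polling loop.
Below is a minimal sketch of such a loop, assuming an application-provided
device id "vid", NIC port "port" and mempool "mbuf_pool", and a single queue
pair addressed through VIRTIO_RXQ/VIRTIO_TXQ; it is an illustration of the
intended usage, not code carried by this patch.

	#include <rte_ethdev.h>
	#include <rte_mbuf.h>
	#include <rte_virtio_net.h>

	#define BURST 32

	/* Move one burst each way between a NIC port and a vhost device. */
	static void
	vhost_switch_once(int vid, uint8_t port, struct rte_mempool *mbuf_pool)
	{
		struct rte_mbuf *pkts[BURST];
		uint16_t nb, sent, i;

		/* NIC -> guest: enqueue copies packet data, so always free. */
		nb = rte_eth_rx_burst(port, 0, pkts, BURST);
		if (nb > 0) {
			rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts, nb);
			for (i = 0; i < nb; i++)
				rte_pktmbuf_free(pkts[i]);
		}

		/* Guest -> NIC: dequeued mbufs are allocated from mbuf_pool. */
		nb = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
				pkts, BURST);
		sent = rte_eth_tx_burst(port, 0, pkts, nb);
		for (i = sent; i < nb; i++)
			rte_pktmbuf_free(pkts[i]);
	}

Since rte_vhost_enqueue_burst() copies data into the guest's buffers, the
caller keeps ownership of its mbufs either way; rte_vhost_dequeue_burst()
returns mbufs drawn from "mbuf_pool", which then belong to the caller.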