From 8d01b9cd70a67cdafd5b965a70420c3bd7fb3f82 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 1 Nov 2018 11:59:50 +0000 Subject: New upstream version 18.11-rc1 Change-Id: Iaa71986dd6332e878d8f4bf493101b2bbc6313bb Signed-off-by: Luca Boccassi --- lib/librte_eal/linuxapp/eal/Makefile | 20 +- lib/librte_eal/linuxapp/eal/eal.c | 119 ++++-- lib/librte_eal/linuxapp/eal/eal_dev.c | 172 +++++++- lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 1 + lib/librte_eal/linuxapp/eal/eal_interrupts.c | 79 ++++ lib/librte_eal/linuxapp/eal/eal_memalloc.c | 466 +++++++++++++++++---- lib/librte_eal/linuxapp/eal/eal_memory.c | 264 +++++++++--- lib/librte_eal/linuxapp/eal/eal_thread.c | 4 +- lib/librte_eal/linuxapp/eal/eal_timer.c | 5 +- lib/librte_eal/linuxapp/eal/eal_vfio.c | 216 ++++++---- lib/librte_eal/linuxapp/eal/eal_vfio.h | 4 + lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 11 + .../linuxapp/eal/include/exec-env/rte_kni_common.h | 6 + 13 files changed, 1085 insertions(+), 282 deletions(-) (limited to 'lib/librte_eal/linuxapp/eal') diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index fd92c75c..51deb579 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH) EXPORT_MAP := ../../rte_eal_version.map VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) -LIBABIVER := 8 +LIBABIVER := 9 VPATH += $(RTE_SDK)/lib/librte_eal/common @@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c @@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) -CFLAGS_eal.o := -D_GNU_SOURCE -CFLAGS_eal_interrupts.o := -D_GNU_SOURCE -CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE -CFLAGS_eal_timer.o := -D_GNU_SOURCE -CFLAGS_eal_lcore.o := -D_GNU_SOURCE -CFLAGS_eal_memalloc.o := -D_GNU_SOURCE -CFLAGS_eal_thread.o := -D_GNU_SOURCE -CFLAGS_eal_log.o := -D_GNU_SOURCE -CFLAGS_eal_common_log.o := -D_GNU_SOURCE -CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE -CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE -CFLAGS_eal_common_options.o := -D_GNU_SOURCE -CFLAGS_eal_common_thread.o := -D_GNU_SOURCE -CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE -CFLAGS_rte_cycles.o := -D_GNU_SOURCE - # workaround for a gcc bug with noreturn attribute # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c index e59ac657..361744d4 100644 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ b/lib/librte_eal/linuxapp/eal/eal.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_thread.h" @@ -149,7 +150,7 @@ eal_create_runtime_dir(void) } const char * -eal_get_runtime_dir(void) +rte_eal_get_runtime_dir(void) { return runtime_dir; } @@ -263,6 +264,8 @@ rte_eal_config_create(void) * processes could later map the config into this exact location */ rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + rte_config.mem_config->dma_maskbits = 0; + } /* attach to an existing shared memory config */ @@ -352,6 +355,24 @@ eal_proc_type_detect(void) return ptype; } +/* copies data from internal config to shared config */ +static void +eal_update_mem_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = internal_config.single_file_segments; +} + +/* copies data from shared config to internal config */ +static void +eal_update_internal_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + /* Sets up rte_config structure with the pointer to shared memory config.*/ static void rte_config_init(void) @@ -361,11 +382,13 @@ rte_config_init(void) switch (rte_config.process_type){ case RTE_PROC_PRIMARY: rte_eal_config_create(); + eal_update_mem_config(); break; case RTE_PROC_SECONDARY: rte_eal_config_attach(); rte_eal_mcfg_wait_complete(rte_config.mem_config); rte_eal_config_reattach(); + eal_update_internal_config(); break; case RTE_PROC_AUTO: case RTE_PROC_INVALID: @@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv) argvopt = argv; optind = 1; + opterr = 0; while ((opt = getopt_long(argc, argvopt, eal_short_options, eal_long_options, &option_index)) != EOF) { - /* getopt is not happy, stop right now */ + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + eal_usage(prgname); ret = -1; goto out; @@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg) { int *socket_id = arg; + if (msl->external) + return 0; + return *socket_id == msl->socket_id; } @@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv) int i, fctret, ret; pthread_t thread_id; static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - const char *logid; + const char *p; + static char logid[PATH_MAX]; char cpuset[RTE_CPU_AFFINITY_STR_LEN]; char thread_name[RTE_MAX_THREAD_NAME_LEN]; @@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv) return -1; } - logid = strrchr(argv[0], '/'); - logid = strdup(logid ? logid + 1: argv[0]); - + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); thread_id = pthread_self(); eal_reset_internal_config(&internal_config); @@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv) } if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins\n"); + rte_eal_init_alert("Cannot init plugins"); rte_errno = EINVAL; rte_atomic32_clear(&run_once); return -1; @@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv) rte_config_init(); if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); return -1; } @@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv) * bus through mp channel in the secondary process before the bus scan. */ if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel\n"); + rte_eal_init_alert("failed to init mp channel"); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { rte_errno = EFAULT; return -1; } } + /* register multi-process action callbacks for hotplug */ + if (rte_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; rte_atomic32_clear(&run_once); return -1; } - /* autodetect the iova mapping mode (default is iova_pa) */ - rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class(); - - /* Workaround for KNI which requires physical address to work */ - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && - rte_eal_check_module("rte_kni") == 1) { - rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; - RTE_LOG(WARNING, EAL, - "Some devices want IOVA as VA but PA will be used because.. " - "KNI module inserted\n"); + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + + /* Workaround for KNI which requires physical address to work */ + if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, + "Some devices want IOVA as VA but PA will be used because.. " + "KNI module inserted\n"); + } + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; } if (internal_config.no_hugetlbfs == 0) { @@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv) #ifdef VFIO_PRESENT if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO\n"); + rte_eal_init_alert("Cannot init VFIO"); rte_errno = EAGAIN; rte_atomic32_clear(&run_once); return -1; @@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv) * initialize memzones first. */ if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone\n"); + rte_eal_init_alert("Cannot init memzone"); rte_errno = ENODEV; return -1; } if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory\n"); + rte_eal_init_alert("Cannot init memory"); rte_errno = ENOMEM; return -1; } @@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv) eal_hugedirs_unlock(); if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap\n"); + rte_eal_init_alert("Cannot init malloc heap"); rte_errno = ENODEV; return -1; } if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_eal_init_alert("Cannot init tail queues for objects"); rte_errno = EFAULT; return -1; } if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + rte_eal_init_alert("Cannot init interrupt-handling thread"); /* rte_eal_alarm_init sets rte_errno on failure. */ return -1; } if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_eal_init_alert("Cannot init HPET or TSC timers"); rte_errno = ENOTSUP; return -1; } @@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - rte_config.master_lcore, (int)thread_id, cpuset, + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); RTE_LCORE_FOREACH_SLAVE(i) { @@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv) /* initialize services so vdevs register service during bus_probe. */ ret = rte_service_init(); if (ret) { - rte_eal_init_alert("rte_service_init() failed\n"); + rte_eal_init_alert("rte_service_init() failed"); rte_errno = ENOEXEC; return -1; } /* Probe all the buses and devices/drivers on them */ if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices\n"); + rte_eal_init_alert("Cannot probe devices"); rte_errno = ENOTSUP; return -1; } @@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv) rte_eal_mcfg_complete(); + /* Call each registered callback, if enabled */ + rte_option_init(); + return fctret; } @@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg __rte_unused) { /* ms is const, so find this memseg */ - struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl); + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c index 1cf6aebf..d589c692 100644 --- a/lib/librte_eal/linuxapp/eal/eal_dev.c +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -14,15 +16,32 @@ #include #include #include +#include +#include +#include +#include #include "eal_private.h" static struct rte_intr_handle intr_handle = {.fd = -1 }; static bool monitor_started; +static bool hotplug_handle; #define EAL_UEV_MSG_LEN 4096 #define EAL_UEV_MSG_ELEM_LEN 128 +/* + * spinlock for device hot-unplug failure handling. If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. + */ +static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; + +static struct sigaction sigbus_action_old; + +static int sigbus_need_recover; + static void dev_uev_handler(__rte_unused void *param); /* identify the system layer which reports this event. */ @@ -33,6 +52,55 @@ enum eal_dev_event_subsystem { EAL_DEV_EVENT_SUBSYSTEM_MAX }; +static void +sigbus_action_recover(void) +{ + if (sigbus_need_recover) { + sigaction(SIGBUS, &sigbus_action_old, NULL); + sigbus_need_recover = 0; + } +} + +static void sigbus_handler(int signum, siginfo_t *info, + void *ctx __rte_unused) +{ + int ret; + + RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n", + (int)pthread_self(), info->si_addr); + + rte_spinlock_lock(&failure_handle_lock); + ret = rte_bus_sigbus_handler(info->si_addr); + rte_spinlock_unlock(&failure_handle_lock); + if (ret == -1) { + rte_exit(EXIT_FAILURE, + "Failed to handle SIGBUS for hot-unplug, " + "(rte_errno: %s)!", strerror(rte_errno)); + } else if (ret == 1) { + if (sigbus_action_old.sa_flags == SA_SIGINFO + && sigbus_action_old.sa_sigaction) { + (*(sigbus_action_old.sa_sigaction))(signum, + info, ctx); + } else if (sigbus_action_old.sa_flags != SA_SIGINFO + && sigbus_action_old.sa_handler) { + (*(sigbus_action_old.sa_handler))(signum); + } else { + rte_exit(EXIT_FAILURE, + "Failed to handle generic SIGBUS!"); + } + } + + RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); +} + +static int cmp_dev_name(const struct rte_device *dev, + const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + static int dev_uev_socket_fd_create(void) { @@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param) struct rte_dev_event uevent; int ret; char buf[EAL_UEV_MSG_LEN]; + struct rte_bus *bus; + struct rte_device *dev; + const char *busname = ""; memset(&uevent, 0, sizeof(struct rte_dev_event)); memset(buf, 0, EAL_UEV_MSG_LEN); @@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param) RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", uevent.devname, uevent.type, uevent.subsystem); - if (uevent.devname) - dev_callback_process(uevent.devname, uevent.type); + switch (uevent.subsystem) { + case EAL_DEV_EVENT_SUBSYSTEM_PCI: + case EAL_DEV_EVENT_SUBSYSTEM_UIO: + busname = "pci"; + break; + default: + break; + } + + if (uevent.devname) { + if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { + rte_spinlock_lock(&failure_handle_lock); + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", + busname); + return; + } + + dev = bus->find_device(NULL, cmp_dev_name, + uevent.devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s) on " + "bus (%s)\n", uevent.devname, busname); + return; + } + + ret = bus->hot_unplug_handler(dev); + rte_spinlock_unlock(&failure_handle_lock); + if (ret) { + RTE_LOG(ERR, EAL, "Can not handle hot-unplug " + "for device (%s)\n", dev->name); + return; + } + } + rte_dev_event_callback_process(uevent.devname, uevent.type); + } } int __rte_experimental @@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void) close(intr_handle.fd); intr_handle.fd = -1; monitor_started = false; + return 0; } + +int +dev_sigbus_handler_register(void) +{ + sigset_t mask; + struct sigaction action; + + rte_errno = 0; + + if (sigbus_need_recover) + return 0; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = SA_SIGINFO; + action.sa_mask = mask; + action.sa_sigaction = sigbus_handler; + sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); + + return rte_errno; +} + +int +dev_sigbus_handler_unregister(void) +{ + rte_errno = 0; + + sigbus_action_recover(); + + return rte_errno; +} + +int __rte_experimental +rte_dev_hotplug_handle_enable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_register(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to register sigbus handler for devices.\n"); + + hotplug_handle = true; + + return ret; +} + +int __rte_experimental +rte_dev_hotplug_handle_disable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_unregister(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to unregister sigbus handler for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c index 3a7d4b22..0eab1cf7 100644 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 4076c6d6..39252a88 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "eal_private.h" #include "eal_vfio.h" @@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) { return ret; } + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int +vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif #endif static int @@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle) if (vfio_enable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle) if (vfio_disable_intx(intr_handle)) return -1; break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + return -1; + break; +#endif #endif /* not used at this moment */ case RTE_INTR_HANDLE_DEV_EVENT: @@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds) case RTE_INTR_HANDLE_VFIO_LEGACY: bytes_read = sizeof(buf.vfio_intr_count); break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif #endif case RTE_INTR_HANDLE_VDEV: case RTE_INTR_HANDLE_EXT: diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c index aa95551a..48b9c736 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -51,31 +52,56 @@ const int anonymous_hugepages_supported = #define RTE_MAP_HUGE_SHIFT 26 #endif +/* + * we don't actually care if memfd itself is supported - we only need to check + * if memfd supports hugetlbfs, as that already implies memfd support. + * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB +#define MEMFD_SUPPORTED + 1; +#else + 0; +#endif + /* * not all kernel version support fallocate on hugetlbfs, so fall back to * ftruncate and disallow deallocation if fallocate is not supported. */ static int fallocate_supported = -1; /* unknown */ -/* for single-file segments, we need some kind of mechanism to keep track of +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we need some kind of mechanism to keep track of * which hugepages can be freed back to the system, and which cannot. we cannot * use flock() because they don't allow locking parts of a file, and we cannot * use fcntl() due to issues with their semantics, so we will have to rely on a - * bunch of lockfiles for each page. + * bunch of lockfiles for each page. so, we will use 'fds' array to keep track + * of per-page lockfiles. we will store the actual segment list fd in the + * 'memseg_list_fd' field. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. * * we cannot know how many pages a system will have in advance, but we do know * that they come in lists, and we know lengths of these lists. so, simply store * a malloc'd array of fd's indexed by list and segment index. * * they will be initialized at startup, and filled as we allocate/deallocate - * segments. also, use this to track memseg list proper fd. + * segments. */ static struct { int *fds; /**< dynamically allocated array of segment lock fd's */ int memseg_list_fd; /**< memseg list fd */ int len; /**< total length of the array */ int count; /**< entries used in an array */ -} lock_fds[RTE_MAX_MEMSEG_LISTS]; +} fd_list[RTE_MAX_MEMSEG_LISTS]; /** local copy of a memory map, used to synchronize memory hotplug in MP */ static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; @@ -182,6 +208,31 @@ get_file_size(int fd) return st.st_size; } +static inline uint32_t +bsf64(uint64_t v) +{ + return (uint32_t)__builtin_ctzll(v); +} + +static inline uint32_t +log2_u64(uint64_t v) +{ + if (v == 0) + return 0; + v = rte_align64pow2(v); + return bsf64(v); +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ static int lock(int fd, int type) { @@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) char path[PATH_MAX] = {0}; int fd; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* does this lock already exist? */ if (fd >= 0) return fd; @@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx) return -1; } /* store it for future reference */ - lock_fds[list_idx].fds[seg_idx] = fd; - lock_fds[list_idx].count++; + fd_list[list_idx].fds[seg_idx] = fd; + fd_list[list_idx].count++; return fd; } @@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx) { int fd, ret; - if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds)) + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) return -1; - if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len) + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) return -1; - fd = lock_fds[list_idx].fds[seg_idx]; + fd = fd_list[list_idx].fds[seg_idx]; /* upgrade lock to exclusive to see if we can remove the lockfile */ ret = lock(fd, LOCK_EX); @@ -270,25 +321,77 @@ static int unlock_segment(int list_idx, int seg_idx) * and remove it from list anyway. */ close(fd); - lock_fds[list_idx].fds[seg_idx] = -1; - lock_fds[list_idx].count--; + fd_list[list_idx].fds[seg_idx] = -1; + fd_list[list_idx].count--; if (ret < 0) return -1; return 0; } +static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + static int get_seg_fd(char *path, int buflen, struct hugepage_info *hi, unsigned int list_idx, unsigned int seg_idx) { int fd; + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. + */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + if (internal_config.single_file_segments) { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); - fd = lock_fds[list_idx].memseg_list_fd; + fd = fd_list[list_idx].memseg_list_fd; if (fd < 0) { fd = open(path, O_CREAT | O_RDWR, 0600); @@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi, close(fd); return -1; } - lock_fds[list_idx].memseg_list_fd = fd; + fd_list[list_idx].memseg_list_fd = fd; } } else { /* create a hugepage file path */ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - fd = open(path, O_CREAT | O_RDWR, 0600); + + fd = fd_list[list_idx].fds[seg_idx]; + if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; } } return fd; @@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, uint64_t fa_offset, uint64_t page_sz, bool grow) { bool again = false; + + /* in-memory mode is a special case, because we don't need to perform + * any locking, and we can be sure that fallocate() is supported. + */ + if (internal_config.in_memory) { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + /* increase/decrease total segment count */ + fd_list[list_idx].count += (grow ? 1 : -1); + if (!grow && fd_list[list_idx].count == 0) { + close(fd_list[list_idx].memseg_list_fd); + fd_list[list_idx].memseg_list_fd = -1; + } + return 0; + } + do { if (fallocate_supported == 0) { /* we cannot deallocate memory if fallocate() is not @@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * page file fd, so that one of the processes * could then delete the file after shrinking. */ - if (ret < 1 && lock_fds[list_idx].count == 0) { + if (ret < 1 && fd_list[list_idx].count == 0) { close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } if (ret < 0) { @@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx, * more segments active in this segment list, * and remove the file if there aren't. */ - if (lock_fds[list_idx].count == 0) { + if (fd_list[list_idx].count == 0) { if (unlink(path)) RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", __func__, path, strerror(errno)); close(fd); - lock_fds[list_idx].memseg_list_fd = -1; + fd_list[list_idx].memseg_list_fd = -1; } } } @@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, void *new_addr; alloc_sz = hi->hugepage_sz; - if (!internal_config.single_file_segments && - internal_config.in_memory && - anonymous_hugepages_supported) { - int log2, flags; - - log2 = rte_log2_u32(alloc_sz); - /* as per mmap() manpage, all page sizes are log2 of page size - * shifted by MAP_HUGE_SHIFT - */ - flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED | + + /* these are checked at init, but code analyzers don't know that */ + if (internal_config.in_memory && !anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); + return -1; + } + if (internal_config.in_memory && !memfd_create_supported && + internal_config.single_file_segments) { + RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); + return -1; + } + + /* in-memory without memfd is a special case */ + int mmap_flags; + + if (internal_config.in_memory && !memfd_create_supported) { + int pagesz_flag, flags; + + pagesz_flag = pagesz_flags(alloc_sz); + flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS; fd = -1; - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0); - - /* single-file segments codepath will never be active because - * in-memory mode is incompatible with it and it's stopped at - * EAL initialization stage, however the compiler doesn't know - * that and complains about map_offset being used uninitialized - * on failure codepaths while having in-memory mode enabled. so, - * assign a value here. + mmap_flags = flags; + + /* single-file segments codepath will never be active + * here because in-memory mode is incompatible with the + * fallback path, and it's stopped at EAL initialization + * stage. */ map_offset = 0; } else { @@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, __func__, strerror(errno)); goto resized; } - if (internal_config.hugepage_unlink) { + if (internal_config.hugepage_unlink && + !internal_config.in_memory) { if (unlink(path)) { RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", __func__, strerror(errno)); @@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, } } } - - /* - * map the segment, and populate page tables, the kernel fills - * this segment with zeros if it's a new page. - */ - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, - map_offset); + mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; } + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, + map_offset); + if (va == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, strerror(errno)); @@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, goto mapped; } #endif - /* for non-single file segments that aren't in-memory, we can close fd - * here */ - if (!internal_config.single_file_segments && !internal_config.in_memory) - close(fd); ms->addr = addr; ms->hugepage_sz = alloc_sz; @@ -626,7 +767,10 @@ unmapped: RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); } resized: - /* in-memory mode will never be single-file-segments mode */ + /* some codepaths will return negative fd, so exit early */ + if (fd < 0) + return -1; + if (internal_config.single_file_segments) { resize_hugefile(fd, path, list_idx, seg_idx, map_offset, alloc_sz, false); @@ -638,6 +782,7 @@ resized: lock(fd, LOCK_EX) == 1) unlink(path); close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } return -1; } @@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, { uint64_t map_offset; char path[PATH_MAX]; - int fd, ret; + int fd, ret = 0; + bool exit_early; /* erase page data */ memset(ms->addr, 0, ms->len); @@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, return -1; } + exit_early = false; + + /* if we're using anonymous hugepages, nothing to be done */ + if (internal_config.in_memory && !memfd_create_supported) + exit_early = true; + /* if we've already unlinked the page, nothing needs to be done */ - if (internal_config.hugepage_unlink) { + if (!internal_config.in_memory && internal_config.hugepage_unlink) + exit_early = true; + + if (exit_early) { memset(ms, 0, sizeof(*ms)); return 0; } @@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, /* if we're able to take out a write lock, we're the last one * holding onto this page. */ - ret = lock(fd, LOCK_EX); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) - unlink(path); + if (!internal_config.in_memory) { + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } } /* closing fd will drop the lock */ close(fd); + fd_list[list_idx].fds[seg_idx] = -1; } memset(ms, 0, sizeof(*ms)); @@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg) int msl_idx, seg_idx, ret, dir_fd = -1; start_addr = (uintptr_t) msl->base_va; - end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz; + end_addr = start_addr + msl->len; if ((uintptr_t)wa->ms->addr < start_addr || (uintptr_t)wa->ms->addr >= end_addr) @@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) unsigned int i; int msl_idx; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, char name[PATH_MAX]; int msl_idx, ret; + if (msl->external) + return 0; + msl_idx = msl - mcfg->memsegs; primary_msl = &mcfg->memsegs[msl_idx]; local_msl = &local_memsegs[msl_idx]; @@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl, return -1; } local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; return 0; } static int -secondary_lock_list_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) +alloc_list(int list_idx, int len) { - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned int i, len; - int msl_idx; int *data; + int i; - msl_idx = msl - mcfg->memsegs; - len = msl->memseg_arr.len; - - /* ensure we have space to store lock fd per each possible segment */ + /* ensure we have space to store fd per each possible segment */ data = malloc(sizeof(int) * len); if (data == NULL) { - RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n"); + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); return -1; } /* set all fd's as invalid */ for (i = 0; i < len; i++) data[i] = -1; - lock_fds[msl_idx].fds = data; - lock_fds[msl_idx].len = len; - lock_fds[msl_idx].count = 0; - lock_fds[msl_idx].memseg_list_fd = -1; + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; return 0; } +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return -ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { + /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + } + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + int eal_memalloc_init(void) { if (rte_eal_process_type() == RTE_PROC_SECONDARY) if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); - /* initialize all of the lock fd lists */ - if (internal_config.single_file_segments) - if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL)) + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. + */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; return 0; } diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index dbf19499..fce86fda 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -5,6 +5,7 @@ #define _FILE_OFFSET_BITS 64 #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, int node_id = -1; int essential_prev = 0; int oldpolicy; - struct bitmask *oldmask = numa_allocate_nodemask(); + struct bitmask *oldmask = NULL; bool have_numa = true; unsigned long maxnode = 0; @@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, if (have_numa) { RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); if (get_mempolicy(&oldpolicy, oldmask->maskp, oldmask->size + 1, 0, 0) < 0) { RTE_LOG(ERR, EAL, @@ -402,7 +405,8 @@ out: numa_set_localalloc(); } } - numa_free_cpumask(oldmask); + if (oldmask != NULL) + numa_free_cpumask(oldmask); #endif return i; } @@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl, for (page = 0; page < nrpages; page++) { struct hugepage_file *hp = &hugepg_tbl[page]; - if (hp->final_va != NULL && unlink(hp->filepath)) { + if (hp->orig_va != NULL && unlink(hp->filepath)) { RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", __func__, hp->filepath, strerror(errno)); } @@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) rte_fbarray_set_used(arr, ms_idx); - close(fd); + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", (seg_len * page_sz) >> 20, socket_id); @@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl) return -1; } msl->base_va = addr; + msl->len = mem_sz; return 0; } @@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void) msl->base_va = addr; msl->page_sz = page_sz; msl->socket_id = 0; + msl->len = internal_config.memory; /* populate memsegs. each memseg is one page long */ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { @@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void) if (msl->memseg_arr.count > 0) continue; /* this is an unused list, deallocate it */ - mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len; + mem_sz = msl->len; munmap(msl->base_va, mem_sz); msl->base_va = NULL; @@ -1770,6 +1779,7 @@ getFileSize(int fd) static int eal_legacy_hugepage_attach(void) { + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; struct hugepage_file *hp = NULL; unsigned int num_hp = 0; unsigned int i = 0; @@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void) struct hugepage_file *hf = &hp[i]; size_t map_sz = hf->size; void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; /* if size is zero, no more pages left */ if (map_sz == 0) @@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void) if (map_addr == MAP_FAILED) { RTE_LOG(ERR, EAL, "Could not map %s: %s\n", hf->filepath, strerror(errno)); - close(fd); - goto error; + goto fd_error; } /* set shared lock on the file. */ if (flock(fd, LOCK_SH) < 0) { RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", __func__, strerror(errno)); - close(fd); - goto error; + goto fd_error; } - close(fd); + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto fd_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto fd_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto fd_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); } /* unmap the hugepage config file, since we are done using it */ munmap(hp, size); close(fd_hugepage); return 0; +fd_error: + close(fd); error: /* map all segments into memory to make sure we get the addrs */ cur_seg = 0; @@ -2093,18 +2131,65 @@ static int __rte_unused memseg_primary_init(void) { struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int i, socket_id, hpi_idx, msl_idx = 0; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ struct rte_memseg_list *msl; - uint64_t max_mem, total_mem; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; /* no-huge does not need this at all */ if (internal_config.no_hugetlbfs) return 0; - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - total_mem = 0; + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). + * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. + * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. + */ - /* create memseg lists */ + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; hpi_idx++) { struct hugepage_info *hpi; @@ -2113,62 +2198,114 @@ memseg_primary_init(void) hpi = &internal_config.hugepage_info[hpi_idx]; hugepage_sz = hpi->hugepage_sz; - for (i = 0; i < (int) rte_socket_count(); i++) { - uint64_t max_type_mem, total_type_mem = 0; - int type_msl_idx, max_segs, total_segs = 0; - - socket_id = rte_socket_id_by_idx(i); + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); #ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES if (socket_id > 0) break; #endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; - if (total_mem >= max_mem) - break; - - max_type_mem = RTE_MIN(max_mem - total_mem, - (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); - max_segs = RTE_MAX_MEMSEG_PER_TYPE; + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } - type_msl_idx = 0; - while (total_type_mem < max_type_mem && - total_segs < max_segs) { - uint64_t cur_max_mem, cur_mem; - unsigned int n_segs; + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } - msl = &mcfg->memsegs[msl_idx++]; + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; - cur_max_mem = max_type_mem - total_type_mem; + pagesz = type->page_sz; + socket_id = type->socket_id; - cur_mem = get_mem_amount(hugepage_sz, - cur_max_mem); - n_segs = cur_mem / hugepage_sz; + /* + * we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. number of memseg lists we are allowed to take up + */ - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - socket_id, type_msl_idx)) - return -1; + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; - total_segs += msl->memseg_arr.len; - total_type_mem = total_segs * hugepage_sz; - type_msl_idx++; + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - return -1; - } + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; } - total_mem += total_type_mem; } } - return 0; + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; } static int @@ -2204,6 +2341,25 @@ memseg_secondary_init(void) int rte_eal_memseg_init(void) { + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } + return rte_eal_process_type() == RTE_PROC_PRIMARY ? #ifndef RTE_ARCH_64 memseg_primary_init_32() : diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c index b496fc71..379773b6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg) ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", - lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); /* read on our pipe to get commands */ while (1) { diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c index 2766bd78..bc8f0519 100644 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ b/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id; * containing used to process MSB of the HPET (unfortunately, we need * this because hpet is 32 bits by default under linux). */ -static void +static void * hpet_msb_inc(__attribute__((unused)) void *arg) { uint32_t t; @@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg) eal_hpet_msb ++; sleep(10); } + return NULL; } uint64_t @@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default) /* create a thread that will increment a global variable for * msb (hpet is 32 bits by default under linux) */ ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, - (void *(*)(void *))hpet_msb_inc, NULL); + hpet_msb_inc, NULL); if (ret != 0) { RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); internal_config.no_hpet = 1; diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c index c68dc38e..0516b159 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num) return NULL; } -static struct vfio_config * -get_vfio_cfg_by_group_fd(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return vfio_cfg; - } - - return NULL; -} - -static struct vfio_config * -get_vfio_cfg_by_container_fd(int container_fd) -{ - int i; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == container_fd) - return &vfio_cfgs[i]; - } - - return NULL; -} - -int -rte_vfio_get_group_fd(int iommu_group_num) +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) { int i; int vfio_group_fd; struct vfio_group *cur_grp; - struct vfio_config *vfio_cfg; - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; /* check if we already have the group descriptor open */ for (i = 0; i < VFIO_MAX_GROUPS; i++) @@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num) return vfio_group_fd; } +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + static int get_vfio_group_idx(int vfio_group_fd) { @@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, msl = rte_mem_virt2memseg_list(addr); /* for IOVA as VA mode, no need to care for IOVA addresses */ - if (rte_eal_iova_mode() == RTE_IOVA_VA) { + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { uint64_t vfio_va = (uint64_t)(uintptr_t)addr; if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, @@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, /* memsegs are contiguous in memory */ ms = rte_mem_virt2memseg(addr, msl); while (cur_len < len) { + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + goto next; + } if (type == RTE_MEM_EVENT_ALLOC) vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 1); else vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, ms->iova, ms->len, 0); - +next: cur_len += ms->len; ++ms; } @@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname) return 0; } - default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd(); + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } /* check if we have VFIO driver enabled */ if (default_vfio_cfg->vfio_container_fd != -1) { @@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname) return default_vfio_cfg->vfio_enabled && mod_available; } +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd) { @@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void) mp_rep = &mp_reply.msgs[0]; p = (struct vfio_mp_param *)mp_rep->param; if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; free(mp_reply.msgs); - return mp_rep->fds[0]; + return vfio_container_fd; } free(mp_reply.msgs); } @@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base, } static int -type1_map(const struct rte_memseg_list *msl __rte_unused, - const struct rte_memseg *ms, void *arg) +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) { int *vfio_container_fd = arg; + if (msl->external) + return 0; + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, struct vfio_iommu_type1_dma_map dma_map; struct vfio_iommu_type1_dma_unmap dma_unmap; int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); dma_map.vaddr = vaddr; @@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } } else { - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 - }; - reg.vaddr = (uintptr_t) vaddr; - reg.size = len; - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); if (ret) { @@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, } static int -vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_map_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { int *vfio_container_fd = arg; - return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + if (msl->external) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, ms->len, 1); } @@ -1210,12 +1285,15 @@ struct spapr_walk_param { uint64_t hugepage_sz; }; static int -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, const struct rte_memseg *ms, void *arg) { struct spapr_walk_param *param = arg; uint64_t max = ms->iova + ms->len; + if (msl->external) + return 0; + if (max > param->window_size) { param->hugepage_sz = ms->hugepage_sz; param->window_size = max; @@ -1670,9 +1748,6 @@ int rte_vfio_container_group_bind(int container_fd, int iommu_group_num) { struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp; - int vfio_group_fd; - int i; vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); if (vfio_cfg == NULL) { @@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num) return -1; } - /* Check room for new group */ - if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - - /* Get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == -1) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); - return -1; - } - - vfio_group_fd = vfio_open_group_fd(iommu_group_num); - if (vfio_group_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); - return -1; - } - cur_grp->group_num = iommu_group_num; - cur_grp->fd = vfio_group_fd; - cur_grp->devices = 0; - vfio_cfg->vfio_active_groups++; - - return vfio_group_fd; + return vfio_get_group_fd(vfio_cfg, iommu_group_num); } int diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 68d4750a..63ae115c 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -115,6 +115,9 @@ struct vfio_iommu_type { vfio_dma_func_t dma_map_func; }; +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + /* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ const struct vfio_iommu_type * vfio_set_iommu_type(int vfio_container_fd); @@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void); #define SOCKET_REQ_CONTAINER 0x100 #define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 #define SOCKET_OK 0x0 #define SOCKET_NO_FD 0x1 #define SOCKET_ERR 0xFF diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c index 680a24aa..a1e8c834 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) reply.fds[0] = fd; } break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; default: RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); return -1; diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h index cfa9448b..5afa0871 100644 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -8,6 +8,7 @@ #ifdef __KERNEL__ #include +#include #define RTE_STD_C11 #else #include @@ -54,8 +55,13 @@ struct rte_kni_request { * Writing should never overwrite the read position */ struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else volatile unsigned write; /**< Next position to be written*/ volatile unsigned read; /**< Next position to be read */ +#endif unsigned len; /**< Circular buffer length */ unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ void *volatile buffer[]; /**< The buffer contains mbuf pointers */ -- cgit 1.2.3-korg