Diffstat (limited to 'lib/librte_eal/linuxapp')
-rw-r--r--  lib/librte_eal/linuxapp/eal/Makefile                           |  20
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal.c                              | 119
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_dev.c                          | 172
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c                |   1
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_interrupts.c                   |  79
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memalloc.c                     | 466
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memory.c                       | 264
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_thread.c                       |   4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_timer.c                        |   5
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.c                         | 216
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.h                         |   4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c                 |  11
-rw-r--r--  lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h |   6
13 files changed, 1085 insertions, 282 deletions
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index fd92c75c..51deb579 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH)
EXPORT_MAP := ../../rte_eal_version.map
VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
-LIBABIVER := 8
+LIBABIVER := 9
VPATH += $(RTE_SDK)/lib/librte_eal/common
@@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
@@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c
CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-CFLAGS_eal.o := -D_GNU_SOURCE
-CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
-CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
-CFLAGS_eal_timer.o := -D_GNU_SOURCE
-CFLAGS_eal_lcore.o := -D_GNU_SOURCE
-CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
-CFLAGS_eal_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_log.o := -D_GNU_SOURCE
-CFLAGS_eal_common_log.o := -D_GNU_SOURCE
-CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
-CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
-CFLAGS_eal_common_options.o := -D_GNU_SOURCE
-CFLAGS_eal_common_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE
-CFLAGS_rte_cycles.o := -D_GNU_SOURCE
-
# workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index e59ac657..361744d4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -48,6 +48,7 @@
#include <rte_atomic.h>
#include <malloc_heap.h>
#include <rte_vfio.h>
+#include <rte_option.h>
#include "eal_private.h"
#include "eal_thread.h"
@@ -149,7 +150,7 @@ eal_create_runtime_dir(void)
}
const char *
-eal_get_runtime_dir(void)
+rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
@@ -263,6 +264,8 @@ rte_eal_config_create(void)
* processes could later map the config into this exact location */
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+ rte_config.mem_config->dma_maskbits = 0;
+
}
/* attach to an existing shared memory config */
@@ -352,6 +355,24 @@ eal_proc_type_detect(void)
return ptype;
}
+/* copies data from internal config to shared config */
+static void
+eal_update_mem_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ mcfg->legacy_mem = internal_config.legacy_mem;
+ mcfg->single_file_segments = internal_config.single_file_segments;
+}
+
+/* copies data from shared config to internal config */
+static void
+eal_update_internal_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ internal_config.legacy_mem = mcfg->legacy_mem;
+ internal_config.single_file_segments = mcfg->single_file_segments;
+}
+
/* Sets up rte_config structure with the pointer to shared memory config.*/
static void
rte_config_init(void)
@@ -361,11 +382,13 @@ rte_config_init(void)
switch (rte_config.process_type){
case RTE_PROC_PRIMARY:
rte_eal_config_create();
+ eal_update_mem_config();
break;
case RTE_PROC_SECONDARY:
rte_eal_config_attach();
rte_eal_mcfg_wait_complete(rte_config.mem_config);
rte_eal_config_reattach();
+ eal_update_internal_config();
break;
case RTE_PROC_AUTO:
case RTE_PROC_INVALID:
@@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv)
argvopt = argv;
optind = 1;
+ opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
- /* getopt is not happy, stop right now */
+ /*
+ * getopt didn't recognise the option; let's parse the
+ * registered options to see if the flag is valid
+ */
if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
eal_usage(prgname);
ret = -1;
goto out;
@@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
+ if (msl->external)
+ return 0;
+
return *socket_id == msl->socket_id;
}
@@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv)
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
- const char *logid;
+ const char *p;
+ static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
@@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv)
return -1;
}
- logid = strrchr(argv[0], '/');
- logid = strdup(logid ? logid + 1: argv[0]);
-
+ p = strrchr(argv[0], '/');
+ strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
thread_id = pthread_self();
eal_reset_internal_config(&internal_config);
@@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv)
}
if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
@@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
@@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv)
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0) {
- rte_eal_init_alert("failed to init mp channel\n");
+ rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
}
}
+ /* register multi-process action callbacks for hotplug */
+ if (rte_mp_dev_hotplug_init() < 0) {
+ rte_eal_init_alert("failed to register mp callback for hotplug");
+ return -1;
+ }
+
if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
- /* autodetect the iova mapping mode (default is iova_pa) */
- rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
-
- /* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
- rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
- RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ rte_eal_get_configuration()->iova_mode =
+ rte_bus_get_iommu_class();
+
+ /* Workaround for KNI which requires physical address to work */
+ if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ rte_eal_check_module("rte_kni") == 1) {
+ rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+ RTE_LOG(WARNING, EAL,
+ "Some devices want IOVA as VA but PA will be used because.. "
+ "KNI module inserted\n");
+ }
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
}
if (internal_config.no_hugetlbfs == 0) {
@@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv)
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
- rte_eal_init_alert("Cannot init VFIO\n");
+ rte_eal_init_alert("Cannot init VFIO");
rte_errno = EAGAIN;
rte_atomic32_clear(&run_once);
return -1;
@@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv)
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory\n");
+ rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
@@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv)
eal_hugedirs_unlock();
if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
@@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- rte_config.master_lcore, (int)thread_id, cpuset,
+ RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
@@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv)
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
- rte_eal_init_alert("rte_service_init() failed\n");
+ rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices\n");
+ rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
@@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv)
rte_eal_mcfg_complete();
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
return fctret;
}
@@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg __rte_unused)
{
/* ms is const, so find this memseg */
- struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl);
+ struct rte_memseg *found;
+
+ if (msl->external)
+ return 0;
+
+ found = rte_mem_virt2memseg(ms->addr, msl);
found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 1cf6aebf..d589c692 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -4,6 +4,8 @@
#include <string.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
#include <sys/socket.h>
#include <linux/netlink.h>
@@ -14,15 +16,32 @@
#include <rte_malloc.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
#include "eal_private.h"
static struct rte_intr_handle intr_handle = {.fd = -1 };
static bool monitor_started;
+static bool hotplug_handle;
#define EAL_UEV_MSG_LEN 4096
#define EAL_UEV_MSG_ELEM_LEN 128
+/*
+ * Spinlock for device hot-unplug failure handling. Anything that accesses the
+ * bus or a device (e.g. handling SIGBUS on a bus, or handling a memory failure
+ * for a device) must take this lock; it protects the bus and the device from
+ * race conditions.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
static void dev_uev_handler(__rte_unused void *param);
/* identify the system layer which reports this event. */
@@ -33,6 +52,55 @@ enum eal_dev_event_subsystem {
EAL_DEV_EVENT_SUBSYSTEM_MAX
};
+static void
+sigbus_action_recover(void)
+{
+ if (sigbus_need_recover) {
+ sigaction(SIGBUS, &sigbus_action_old, NULL);
+ sigbus_need_recover = 0;
+ }
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+ void *ctx __rte_unused)
+{
+ int ret;
+
+ RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+ (int)pthread_self(), info->si_addr);
+
+ rte_spinlock_lock(&failure_handle_lock);
+ ret = rte_bus_sigbus_handler(info->si_addr);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret == -1) {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle SIGBUS for hot-unplug, "
+ "(rte_errno: %s)!", strerror(rte_errno));
+ } else if (ret == 1) {
+ if (sigbus_action_old.sa_flags == SA_SIGINFO
+ && sigbus_action_old.sa_sigaction) {
+ (*(sigbus_action_old.sa_sigaction))(signum,
+ info, ctx);
+ } else if (sigbus_action_old.sa_flags != SA_SIGINFO
+ && sigbus_action_old.sa_handler) {
+ (*(sigbus_action_old.sa_handler))(signum);
+ } else {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle generic SIGBUS!");
+ }
+ }
+
+ RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+ const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
static int
dev_uev_socket_fd_create(void)
{
@@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param)
struct rte_dev_event uevent;
int ret;
char buf[EAL_UEV_MSG_LEN];
+ struct rte_bus *bus;
+ struct rte_device *dev;
+ const char *busname = "";
memset(&uevent, 0, sizeof(struct rte_dev_event));
memset(buf, 0, EAL_UEV_MSG_LEN);
@@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param)
RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
uevent.devname, uevent.type, uevent.subsystem);
- if (uevent.devname)
- dev_callback_process(uevent.devname, uevent.type);
+ switch (uevent.subsystem) {
+ case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+ case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+ busname = "pci";
+ break;
+ default:
+ break;
+ }
+
+ if (uevent.devname) {
+ if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+ rte_spinlock_lock(&failure_handle_lock);
+ bus = rte_bus_find_by_name(busname);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+ busname);
+ return;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name,
+ uevent.devname);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+ "bus (%s)\n", uevent.devname, busname);
+ return;
+ }
+
+ ret = bus->hot_unplug_handler(dev);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
+ "for device (%s)\n", dev->name);
+ return;
+ }
+ }
+ rte_dev_event_callback_process(uevent.devname, uevent.type);
+ }
}
int __rte_experimental
@@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void)
close(intr_handle.fd);
intr_handle.fd = -1;
monitor_started = false;
+
return 0;
}
+
+int
+dev_sigbus_handler_register(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ rte_errno = 0;
+
+ if (sigbus_need_recover)
+ return 0;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = SA_SIGINFO;
+ action.sa_mask = mask;
+ action.sa_sigaction = sigbus_handler;
+ sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+ return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+ rte_errno = 0;
+
+ sigbus_action_recover();
+
+ return rte_errno;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_register();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to register sigbus handler for devices.\n");
+
+ hotplug_handle = true;
+
+ return ret;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_unregister();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to unregister sigbus handler for devices.\n");
+
+ hotplug_handle = false;
+
+ return ret;
+}
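
Note: dev_sigbus_handler_register() above installs a SIGBUS handler via
sigaction(), saves the previously installed action, and chains to it when the
fault is not device-related. A minimal standalone sketch of that save-and-chain
pattern in plain C (not DPDK code; the fault-handling body is left as a stub):

#define _GNU_SOURCE
#include <signal.h>
#include <stdlib.h>
#include <string.h>

static struct sigaction old_action;

static void chained_handler(int signum, siginfo_t *info, void *ctx)
{
	/* ...try to handle the fault here first... */

	/* not ours: forward to whatever handler was installed before us */
	if ((old_action.sa_flags & SA_SIGINFO) && old_action.sa_sigaction)
		old_action.sa_sigaction(signum, info, ctx);
	else if (old_action.sa_handler != SIG_DFL &&
			old_action.sa_handler != SIG_IGN &&
			old_action.sa_handler != NULL)
		old_action.sa_handler(signum);
	else
		exit(EXIT_FAILURE);	/* nothing sensible to chain to */
}

static int install_sigbus_handler(void)
{
	struct sigaction action;

	memset(&action, 0, sizeof(action));
	sigemptyset(&action.sa_mask);
	sigaddset(&action.sa_mask, SIGBUS);
	action.sa_flags = SA_SIGINFO;
	action.sa_sigaction = chained_handler;

	/* keep the old action so it can be chained to or restored later */
	return sigaction(SIGBUS, &action, &old_action);
}

int main(void)
{
	if (install_sigbus_handler() != 0)
		return 1;
	/* ...run the application; unrelated SIGBUS faults get chained... */
	return 0;
}
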
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 3a7d4b22..0eab1cf7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/file.h>
#include <dirent.h>
+#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 4076c6d6..39252a88 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -33,6 +33,7 @@
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
+#include <rte_vfio.h>
#include "eal_private.h"
#include "eal_vfio.h"
@@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
return ret;
}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/* enable req notifier */
+static int
+vfio_enable_req(const struct rte_intr_handle *intr_handle)
+{
+ int len, ret;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* disable req notifier */
+static int
+vfio_disable_req(const struct rte_intr_handle *intr_handle)
+{
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
+ intr_handle->fd);
+
+ return ret;
+}
+#endif
#endif
static int
@@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
if (vfio_enable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_enable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
if (vfio_disable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_disable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
case RTE_INTR_HANDLE_VFIO_LEGACY:
bytes_read = sizeof(buf.vfio_intr_count);
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ bytes_read = 0;
+ call = true;
+ break;
+#endif
#endif
case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_EXT:
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index aa95551a..48b9c736 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,6 +34,7 @@
#include <rte_log.h>
#include <rte_eal_memconfig.h>
#include <rte_eal.h>
+#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
@@ -52,30 +53,55 @@ const int anonymous_hugepages_supported =
#endif
/*
+ * we don't actually care if memfd itself is supported - we only need to check
+ * if memfd supports hugetlbfs, as that already implies memfd support.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+static int memfd_create_supported =
+#ifdef MFD_HUGETLB
+#define MEMFD_SUPPORTED
+ 1;
+#else
+ 0;
+#endif
+
+/*
* not all kernel version support fallocate on hugetlbfs, so fall back to
* ftruncate and disallow deallocation if fallocate is not supported.
*/
static int fallocate_supported = -1; /* unknown */
-/* for single-file segments, we need some kind of mechanism to keep track of
+/*
+ * we have two modes - single file segments, and file-per-page mode.
+ *
+ * for single-file segments, we need some kind of mechanism to keep track of
* which hugepages can be freed back to the system, and which cannot. we cannot
* use flock() because they don't allow locking parts of a file, and we cannot
* use fcntl() due to issues with their semantics, so we will have to rely on a
- * bunch of lockfiles for each page.
+ * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
+ * of per-page lockfiles. we will store the actual segment list fd in the
+ * 'memseg_list_fd' field.
+ *
+ * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
+ * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
*
* we cannot know how many pages a system will have in advance, but we do know
* that they come in lists, and we know lengths of these lists. so, simply store
* a malloc'd array of fd's indexed by list and segment index.
*
* they will be initialized at startup, and filled as we allocate/deallocate
- * segments. also, use this to track memseg list proper fd.
+ * segments.
*/
static struct {
int *fds; /**< dynamically allocated array of segment lock fd's */
int memseg_list_fd; /**< memseg list fd */
int len; /**< total length of the array */
int count; /**< entries used in an array */
-} lock_fds[RTE_MAX_MEMSEG_LISTS];
+} fd_list[RTE_MAX_MEMSEG_LISTS];
/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
@@ -182,6 +208,31 @@ get_file_size(int fd)
return st.st_size;
}
+static inline uint32_t
+bsf64(uint64_t v)
+{
+ return (uint32_t)__builtin_ctzll(v);
+}
+
+static inline uint32_t
+log2_u64(uint64_t v)
+{
+ if (v == 0)
+ return 0;
+ v = rte_align64pow2(v);
+ return bsf64(v);
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+ /* as per mmap() manpage, all page sizes are log2 of page size
+ * shifted by MAP_HUGE_SHIFT
+ */
+ int log2 = log2_u64(page_sz);
+ return log2 << RTE_MAP_HUGE_SHIFT;
+}
+
/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
@@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
char path[PATH_MAX] = {0};
int fd;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* does this lock already exist? */
if (fd >= 0)
return fd;
@@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
return -1;
}
/* store it for future reference */
- lock_fds[list_idx].fds[seg_idx] = fd;
- lock_fds[list_idx].count++;
+ fd_list[list_idx].fds[seg_idx] = fd;
+ fd_list[list_idx].count++;
return fd;
}
@@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx)
{
int fd, ret;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* upgrade lock to exclusive to see if we can remove the lockfile */
ret = lock(fd, LOCK_EX);
@@ -270,8 +321,8 @@ static int unlock_segment(int list_idx, int seg_idx)
* and remove it from list anyway.
*/
close(fd);
- lock_fds[list_idx].fds[seg_idx] = -1;
- lock_fds[list_idx].count--;
+ fd_list[list_idx].fds[seg_idx] = -1;
+ fd_list[list_idx].count--;
if (ret < 0)
return -1;
@@ -279,16 +330,68 @@ static int unlock_segment(int list_idx, int seg_idx)
}
static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+ unsigned int list_idx __rte_unused,
+ unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+ int fd;
+ char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].memseg_list_fd = fd;
+ }
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i-%i",
+ list_idx, seg_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+ }
+ }
+ return fd;
+#endif
+ return -1;
+}
+
+static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
int fd;
+ /* for in-memory mode, we only make it here when we're sure we support
+ * memfd, and this is a special case.
+ */
+ if (internal_config.in_memory)
+ return get_seg_memfd(hi, list_idx, seg_idx);
+
if (internal_config.single_file_segments) {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
- fd = lock_fds[list_idx].memseg_list_fd;
+ fd = fd_list[list_idx].memseg_list_fd;
if (fd < 0) {
fd = open(path, O_CREAT | O_RDWR, 0600);
@@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
close(fd);
return -1;
}
- lock_fds[list_idx].memseg_list_fd = fd;
+ fd_list[list_idx].memseg_list_fd = fd;
}
} else {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir,
list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
- fd = open(path, O_CREAT | O_RDWR, 0600);
+
+ fd = fd_list[list_idx].fds[seg_idx];
+
if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- return -1;
- }
- /* take out a read lock */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
}
}
return fd;
@@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
uint64_t fa_offset, uint64_t page_sz, bool grow)
{
bool again = false;
+
+ /* in-memory mode is a special case, because we don't need to perform
+ * any locking, and we can be sure that fallocate() is supported.
+ */
+ if (internal_config.in_memory) {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* grow or shrink the file */
+ ret = fallocate(fd, flags, fa_offset, page_sz);
+
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ /* increase/decrease total segment count */
+ fd_list[list_idx].count += (grow ? 1 : -1);
+ if (!grow && fd_list[list_idx].count == 0) {
+ close(fd_list[list_idx].memseg_list_fd);
+ fd_list[list_idx].memseg_list_fd = -1;
+ }
+ return 0;
+ }
+
do {
if (fallocate_supported == 0) {
/* we cannot deallocate memory if fallocate() is not
@@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* page file fd, so that one of the processes
* could then delete the file after shrinking.
*/
- if (ret < 1 && lock_fds[list_idx].count == 0) {
+ if (ret < 1 && fd_list[list_idx].count == 0) {
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
if (ret < 0) {
@@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* more segments active in this segment list,
* and remove the file if there aren't.
*/
- if (lock_fds[list_idx].count == 0) {
+ if (fd_list[list_idx].count == 0) {
if (unlink(path))
RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
__func__, path,
strerror(errno));
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
}
}
@@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
void *new_addr;
alloc_sz = hi->hugepage_sz;
- if (!internal_config.single_file_segments &&
- internal_config.in_memory &&
- anonymous_hugepages_supported) {
- int log2, flags;
-
- log2 = rte_log2_u32(alloc_sz);
- /* as per mmap() manpage, all page sizes are log2 of page size
- * shifted by MAP_HUGE_SHIFT
- */
- flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+
+ /* these are checked at init, but code analyzers don't know that */
+ if (internal_config.in_memory && !anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+ return -1;
+ }
+ if (internal_config.in_memory && !memfd_create_supported &&
+ internal_config.single_file_segments) {
+ RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+ return -1;
+ }
+
+ /* in-memory without memfd is a special case */
+ int mmap_flags;
+
+ if (internal_config.in_memory && !memfd_create_supported) {
+ int pagesz_flag, flags;
+
+ pagesz_flag = pagesz_flags(alloc_sz);
+ flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED |
MAP_PRIVATE | MAP_ANONYMOUS;
fd = -1;
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
-
- /* single-file segments codepath will never be active because
- * in-memory mode is incompatible with it and it's stopped at
- * EAL initialization stage, however the compiler doesn't know
- * that and complains about map_offset being used uninitialized
- * on failure codepaths while having in-memory mode enabled. so,
- * assign a value here.
+ mmap_flags = flags;
+
+ /* single-file segments codepath will never be active
+ * here because in-memory mode is incompatible with the
+ * fallback path, and it's stopped at EAL initialization
+ * stage.
*/
map_offset = 0;
} else {
@@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
__func__, strerror(errno));
goto resized;
}
- if (internal_config.hugepage_unlink) {
+ if (internal_config.hugepage_unlink &&
+ !internal_config.in_memory) {
if (unlink(path)) {
RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
__func__, strerror(errno));
@@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
}
}
}
-
- /*
- * map the segment, and populate page tables, the kernel fills
- * this segment with zeros if it's a new page.
- */
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
- map_offset);
+ mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
}
+ /*
+ * map the segment, and populate page tables, the kernel fills
+ * this segment with zeros if it's a new page.
+ */
+ va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+ map_offset);
+
if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
strerror(errno));
@@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
goto mapped;
}
#endif
- /* for non-single file segments that aren't in-memory, we can close fd
- * here */
- if (!internal_config.single_file_segments && !internal_config.in_memory)
- close(fd);
ms->addr = addr;
ms->hugepage_sz = alloc_sz;
@@ -626,7 +767,10 @@ unmapped:
RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
}
resized:
- /* in-memory mode will never be single-file-segments mode */
+ /* some codepaths will return negative fd, so exit early */
+ if (fd < 0)
+ return -1;
+
if (internal_config.single_file_segments) {
resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
alloc_sz, false);
@@ -638,6 +782,7 @@ resized:
lock(fd, LOCK_EX) == 1)
unlink(path);
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
return -1;
}
@@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
{
uint64_t map_offset;
char path[PATH_MAX];
- int fd, ret;
+ int fd, ret = 0;
+ bool exit_early;
/* erase page data */
memset(ms->addr, 0, ms->len);
@@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
return -1;
}
+ exit_early = false;
+
+ /* if we're using anonymous hugepages, nothing to be done */
+ if (internal_config.in_memory && !memfd_create_supported)
+ exit_early = true;
+
/* if we've already unlinked the page, nothing needs to be done */
- if (internal_config.hugepage_unlink) {
+ if (!internal_config.in_memory && internal_config.hugepage_unlink)
+ exit_early = true;
+
+ if (exit_early) {
memset(ms, 0, sizeof(*ms));
return 0;
}
@@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
/* if we're able to take out a write lock, we're the last one
* holding onto this page.
*/
- ret = lock(fd, LOCK_EX);
- if (ret >= 0) {
- /* no one else is using this page */
- if (ret == 1)
- unlink(path);
+ if (!internal_config.in_memory) {
+ ret = lock(fd, LOCK_EX);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ }
}
/* closing fd will drop the lock */
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
memset(ms, 0, sizeof(*ms));
@@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
int msl_idx, seg_idx, ret, dir_fd = -1;
start_addr = (uintptr_t) msl->base_va;
- end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+ end_addr = start_addr + msl->len;
if ((uintptr_t)wa->ms->addr < start_addr ||
(uintptr_t)wa->ms->addr >= end_addr)
@@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
unsigned int i;
int msl_idx;
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
char name[PATH_MAX];
int msl_idx, ret;
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
return -1;
}
local_msl->base_va = primary_msl->base_va;
+ local_msl->len = primary_msl->len;
return 0;
}
static int
-secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
- void *arg __rte_unused)
+alloc_list(int list_idx, int len)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned int i, len;
- int msl_idx;
int *data;
+ int i;
- msl_idx = msl - mcfg->memsegs;
- len = msl->memseg_arr.len;
-
- /* ensure we have space to store lock fd per each possible segment */
+ /* ensure we have space to store fd per each possible segment */
data = malloc(sizeof(int) * len);
if (data == NULL) {
- RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
+ RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
return -1;
}
/* set all fd's as invalid */
for (i = 0; i < len; i++)
data[i] = -1;
- lock_fds[msl_idx].fds = data;
- lock_fds[msl_idx].len = len;
- lock_fds[msl_idx].count = 0;
- lock_fds[msl_idx].memseg_list_fd = -1;
+ fd_list[list_idx].fds = data;
+ fd_list[list_idx].len = len;
+ fd_list[list_idx].count = 0;
+ fd_list[list_idx].memseg_list_fd = -1;
+
+ return 0;
+}
+
+static int
+fd_list_create_walk(const struct rte_memseg_list *msl,
+ void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int len;
+ int msl_idx;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ len = msl->memseg_arr.len;
+
+ return alloc_list(msl_idx, len);
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* if list is not allocated, allocate it */
+ if (fd_list[list_idx].len == 0) {
+ int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+ if (alloc_list(list_idx, len) < 0)
+ return -ENOMEM;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
return 0;
}
int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+ int fd;
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+ } else if (fd_list[list_idx].len == 0) {
+ /* list not initialized */
+ fd = -1;
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+ }
+ if (fd < 0)
+ return -ENODEV;
+ return fd;
+}
+
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+ unsigned int i;
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+ int pagesz_flag = pagesz_flags(pagesz);
+ int flags;
+
+ flags = pagesz_flag | MFD_HUGETLB;
+ int fd = memfd_create("test", flags);
+ if (fd < 0) {
+ /* we failed - let memalloc know this isn't working */
+ if (errno == EINVAL) {
+ memfd_create_supported = 0;
+ return 0; /* not supported */
+ }
+
+ /* we got other error - something's wrong */
+ return -1; /* error */
+ }
+ close(fd);
+ return 1; /* supported */
+ }
+#endif
+ return 0; /* not supported */
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* fd_list not initialized? */
+ if (fd_list[list_idx].len == 0)
+ return -ENODEV;
+ if (internal_config.single_file_segments) {
+ size_t pgsz = mcfg->memsegs[list_idx].page_sz;
+
+ /* segment not active? */
+ if (fd_list[list_idx].memseg_list_fd < 0)
+ return -ENOENT;
+ *offset = pgsz * seg_idx;
+ } else {
+ /* segment not active? */
+ if (fd_list[list_idx].fds[seg_idx] < 0)
+ return -ENOENT;
+ *offset = 0;
+ }
+ return 0;
+}
+
+int
eal_memalloc_init(void)
{
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ internal_config.in_memory) {
+ int mfd_res = test_memfd_create();
- /* initialize all of the lock fd lists */
- if (internal_config.single_file_segments)
- if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL))
+ if (mfd_res < 0) {
+ RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+ return -1;
+ }
+ if (mfd_res == 1)
+ RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+ else
+ RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+ /* we only support single-file segments mode with in-memory mode
+ * if we support hugetlbfs with memfd_create. this code will
+ * test if we do.
+ */
+ if (internal_config.single_file_segments &&
+ mfd_res != 1) {
+ RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
return -1;
+ }
+ /* this cannot ever happen but better safe than sorry */
+ if (!anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+ return -1;
+ }
+ }
+
+ /* initialize all of the fd lists */
+ if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+ return -1;
return 0;
}
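
Note: in --in-memory mode the allocator above backs hugepages with memfd files
when the kernel supports MFD_HUGETLB, encoding the page size as log2(page size)
shifted by MFD_HUGE_SHIFT (see pagesz_flags() and get_seg_memfd()). A minimal
standalone sketch of the same mechanism, assuming a Linux kernel and glibc that
provide memfd_create() and MFD_HUGETLB, with a 2M page size used purely as an
example value:

#define _GNU_SOURCE
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>		/* memfd_create(), MFD_HUGETLB, mmap() */
#include <unistd.h>

#ifndef MFD_HUGE_SHIFT
#define MFD_HUGE_SHIFT 26	/* from linux/memfd.h */
#endif

int main(void)
{
	uint64_t page_sz = 2ULL << 20;	/* 2M hugepage, example value */
	/* page size is passed as log2(page_sz) << MFD_HUGE_SHIFT */
	unsigned int flags = MFD_HUGETLB |
		((unsigned int)__builtin_ctzll(page_sz) << MFD_HUGE_SHIFT);
	int fd = memfd_create("seg_example", flags);

	if (fd < 0) {
		/* EINVAL is what test_memfd_create() treats as "hugetlbfs
		 * memfd not supported", falling back to anonymous memory */
		fprintf(stderr, "memfd_create: %s\n", strerror(errno));
		return 1;
	}
	/* grow the anonymous hugepage file by one page, then map it */
	if (ftruncate(fd, page_sz) == 0) {
		void *va = mmap(NULL, page_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd, 0);
		if (va != MAP_FAILED)
			munmap(va, page_sz);
	}
	close(fd);
	return 0;
}
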
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index dbf19499..fce86fda 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -5,6 +5,7 @@
#define _FILE_OFFSET_BITS 64
#include <errno.h>
+#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -17,6 +18,7 @@
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
+#include <sys/resource.h>
#include <unistd.h>
#include <limits.h>
#include <sys/ioctl.h>
@@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
int node_id = -1;
int essential_prev = 0;
int oldpolicy;
- struct bitmask *oldmask = numa_allocate_nodemask();
+ struct bitmask *oldmask = NULL;
bool have_numa = true;
unsigned long maxnode = 0;
@@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
if (have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ oldmask = numa_allocate_nodemask();
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
RTE_LOG(ERR, EAL,
@@ -402,7 +405,8 @@ out:
numa_set_localalloc();
}
}
- numa_free_cpumask(oldmask);
+ if (oldmask != NULL)
+ numa_free_cpumask(oldmask);
#endif
return i;
}
@@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
for (page = 0; page < nrpages; page++) {
struct hugepage_file *hp = &hugepg_tbl[page];
- if (hp->final_va != NULL && unlink(hp->filepath)) {
+ if (hp->orig_va != NULL && unlink(hp->filepath)) {
RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
__func__, hp->filepath, strerror(errno));
}
@@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
rte_fbarray_set_used(arr, ms_idx);
- close(fd);
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
(seg_len * page_sz) >> 20, socket_id);
@@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl)
return -1;
}
msl->base_va = addr;
+ msl->len = mem_sz;
return 0;
}
@@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void)
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
+ msl->len = internal_config.memory;
/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
@@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void)
if (msl->memseg_arr.count > 0)
continue;
/* this is an unused list, deallocate it */
- mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+ mem_sz = msl->len;
munmap(msl->base_va, mem_sz);
msl->base_va = NULL;
@@ -1770,6 +1779,7 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
unsigned int i = 0;
@@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void)
struct hugepage_file *hf = &hp[i];
size_t map_sz = hf->size;
void *map_addr = hf->final_va;
+ int msl_idx, ms_idx;
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
/* if size is zero, no more pages left */
if (map_sz == 0)
@@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void)
if (map_addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
hf->filepath, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
/* set shared lock on the file. */
if (flock(fd, LOCK_SH) < 0) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
__func__, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
- close(fd);
+ /* find segment data */
+ msl = rte_mem_virt2memseg_list(map_addr);
+ if (msl == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
+ __func__);
+ goto fd_error;
+ }
+ ms = rte_mem_virt2memseg(map_addr, msl);
+ if (ms == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
+ __func__);
+ goto fd_error;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
+ __func__);
+ goto fd_error;
+ }
+
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
close(fd_hugepage);
return 0;
+fd_error:
+ close(fd);
error:
/* map all segments into memory to make sure we get the addrs */
cur_seg = 0;
@@ -2093,18 +2131,65 @@ static int __rte_unused
memseg_primary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, socket_id, hpi_idx, msl_idx = 0;
+ struct memtype {
+ uint64_t page_sz;
+ int socket_id;
+ } *memtypes = NULL;
+ int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
struct rte_memseg_list *msl;
- uint64_t max_mem, total_mem;
+ uint64_t max_mem, max_mem_per_type;
+ unsigned int max_seglists_per_type;
+ unsigned int n_memtypes, cur_type;
/* no-huge does not need this at all */
if (internal_config.no_hugetlbfs)
return 0;
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- total_mem = 0;
+ /*
+ * figuring out amount of memory we're going to have is a long and very
+ * involved process. the basic element we're operating with is a memory
+ * type, defined as a combination of NUMA node ID and page size (so that
+ * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
+ *
+ * deciding amount of memory going towards each memory type is a
+ * balancing act between maximum segments per type, maximum memory per
+ * type, and number of detected NUMA nodes. the goal is to make sure
+ * each memory type gets at least one memseg list.
+ *
+ * the total amount of memory is limited by RTE_MAX_MEM_MB value.
+ *
+ * the total amount of memory per type is limited by either
+ * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+ * of detected NUMA nodes. additionally, maximum number of segments per
+ * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
+ * smaller page sizes, it can take hundreds of thousands of segments to
+ * reach the above specified per-type memory limits.
+ *
+ * additionally, each type may have multiple memseg lists associated
+ * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+ * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+ *
+ * the number of memseg lists per type is decided based on the above
+ * limits, and also taking number of detected NUMA nodes, to make sure
+ * that we don't run out of memseg lists before we populate all NUMA
+ * nodes with memory.
+ *
+ * we do this in three stages. first, we collect the number of types.
+ * then, we figure out memory constraints and populate the list of
+ * would-be memseg lists. then, we go ahead and allocate the memseg
+ * lists.
+ */
- /* create memseg lists */
+ /* create space for mem types */
+ n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+ memtypes = calloc(n_memtypes, sizeof(*memtypes));
+ if (memtypes == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+ return -1;
+ }
+
+ /* populate mem types */
+ cur_type = 0;
for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
hpi_idx++) {
struct hugepage_info *hpi;
@@ -2113,62 +2198,114 @@ memseg_primary_init(void)
hpi = &internal_config.hugepage_info[hpi_idx];
hugepage_sz = hpi->hugepage_sz;
- for (i = 0; i < (int) rte_socket_count(); i++) {
- uint64_t max_type_mem, total_type_mem = 0;
- int type_msl_idx, max_segs, total_segs = 0;
-
- socket_id = rte_socket_id_by_idx(i);
+ for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+ int socket_id = rte_socket_id_by_idx(i);
#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
if (socket_id > 0)
break;
#endif
+ memtypes[cur_type].page_sz = hugepage_sz;
+ memtypes[cur_type].socket_id = socket_id;
- if (total_mem >= max_mem)
- break;
-
- max_type_mem = RTE_MIN(max_mem - total_mem,
- (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
- max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ RTE_LOG(DEBUG, EAL, "Detected memory type: "
+ "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+ socket_id, hugepage_sz);
+ }
+ }
- type_msl_idx = 0;
- while (total_type_mem < max_type_mem &&
- total_segs < max_segs) {
- uint64_t cur_max_mem, cur_mem;
- unsigned int n_segs;
+ /* set up limits for types */
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+ max_mem / n_memtypes);
+ /*
+ * limit maximum number of segment lists per type to ensure there's
+ * space for memseg lists for all NUMA nodes with all page sizes
+ */
+ max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
+ if (max_seglists_per_type == 0) {
+ RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
- msl = &mcfg->memsegs[msl_idx++];
+ /* go through all mem types and create segment lists */
+ msl_idx = 0;
+ for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+ unsigned int cur_seglist, n_seglists, n_segs;
+ unsigned int max_segs_per_type, max_segs_per_list;
+ struct memtype *type = &memtypes[cur_type];
+ uint64_t max_mem_per_list, pagesz;
+ int socket_id;
- cur_max_mem = max_type_mem - total_type_mem;
+ pagesz = type->page_sz;
+ socket_id = type->socket_id;
- cur_mem = get_mem_amount(hugepage_sz,
- cur_max_mem);
- n_segs = cur_mem / hugepage_sz;
+ /*
+ * we need to create segment lists for this type. we must take
+ * into account the following things:
+ *
+ * 1. total amount of memory we can use for this memory type
+ * 2. total amount of memory per memseg list allowed
+ * 3. number of segments needed to fit the amount of memory
+ * 4. number of segments allowed per type
+ * 5. number of segments allowed per memseg list
+ * 6. number of memseg lists we are allowed to take up
+ */
- if (alloc_memseg_list(msl, hugepage_sz, n_segs,
- socket_id, type_msl_idx))
- return -1;
+ /* calculate how much segments we will need in total */
+ max_segs_per_type = max_mem_per_type / pagesz;
+ /* limit number of segments to maximum allowed per type */
+ max_segs_per_type = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+ /* limit number of segments to maximum allowed per list */
+ max_segs_per_list = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+ /* calculate how much memory we can have per segment list */
+ max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+ (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+ /* calculate how many segments each segment list will have */
+ n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+ /* calculate how many segment lists we can have */
+ n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+ max_mem_per_type / max_mem_per_list);
+
+ /* limit number of segment lists according to our maximum */
+ n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
+
+ RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+ "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+ n_seglists, n_segs, socket_id, pagesz);
+
+ /* create all segment lists */
+ for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
- total_segs += msl->memseg_arr.len;
- total_type_mem = total_segs * hugepage_sz;
- type_msl_idx++;
+ if (alloc_memseg_list(msl, pagesz, n_segs,
+ socket_id, cur_seglist))
+ goto out;
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
- return -1;
- }
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ goto out;
}
- total_mem += total_type_mem;
}
}
- return 0;
+ /* we're successful */
+ ret = 0;
+out:
+ free(memtypes);
+ return ret;
}
static int
@@ -2204,6 +2341,25 @@ memseg_secondary_init(void)
int
rte_eal_memseg_init(void)
{
+ /* increase rlimit to maximum */
+ struct rlimit lim;
+
+ if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+ /* set limit to maximum */
+ lim.rlim_cur = lim.rlim_max;
+
+ if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
+ strerror(errno));
+ } else {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
+ PRIu64 "\n",
+ (uint64_t)lim.rlim_cur);
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
+ }
+
return rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
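
Note: the per-type sizing logic in memseg_primary_init() is easier to follow
with concrete numbers. The sketch below mirrors the same arithmetic with made-up
limits; the real ones come from the RTE_MAX_* build-time configuration, and the
values used here are illustrative assumptions only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	/* assumed limits - stand-ins for RTE_MAX_MEM_MB and friends */
	uint64_t max_mem = 512ULL << 30;		/* total memory cap */
	uint64_t per_type_mem_cap = 128ULL << 30;	/* per-type memory cap */
	uint64_t per_list_mem_cap = 32ULL << 30;	/* per-list memory cap */
	unsigned int max_segs_type_cap = 32768;		/* segments per type */
	unsigned int max_segs_list_cap = 8192;		/* segments per list */
	unsigned int max_memseg_lists = 128;		/* total list cap */

	/* 2 hugepage sizes x 2 NUMA nodes = 4 memory types; size one of them */
	unsigned int n_memtypes = 2 * 2;
	uint64_t pagesz = 2ULL << 20;			/* the 2M page type */

	uint64_t max_mem_per_type = MIN(per_type_mem_cap, max_mem / n_memtypes);
	unsigned int max_seglists_per_type = max_memseg_lists / n_memtypes;

	unsigned int max_segs_per_type = MIN(max_mem_per_type / pagesz,
			(uint64_t)max_segs_type_cap);
	unsigned int max_segs_per_list = MIN(max_segs_per_type,
			max_segs_list_cap);
	uint64_t max_mem_per_list = MIN(max_segs_per_list * pagesz,
			per_list_mem_cap);
	unsigned int n_segs = MIN(max_segs_per_list, max_mem_per_list / pagesz);
	unsigned int n_seglists = MIN(max_segs_per_type / n_segs,
			max_mem_per_type / max_mem_per_list);

	n_seglists = MIN(n_seglists, max_seglists_per_type);

	/* with these inputs: 4 lists of 8192 segments, i.e. 16G per list */
	printf("%u lists x %u segs of %" PRIu64 "M each\n",
			n_seglists, n_segs, pagesz >> 20);
	return 0;
}
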
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index b496fc71..379773b6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "...");
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
/* read on our pipe to get commands */
while (1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 2766bd78..bc8f0519 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id;
* containing used to process MSB of the HPET (unfortunately, we need
* this because hpet is 32 bits by default under linux).
*/
-static void
+static void *
hpet_msb_inc(__attribute__((unused)) void *arg)
{
uint32_t t;
@@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg)
eal_hpet_msb ++;
sleep(10);
}
+ return NULL;
}
uint64_t
@@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default)
/* create a thread that will increment a global variable for
* msb (hpet is 32 bits by default under linux) */
ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
- (void *(*)(void *))hpet_msb_inc, NULL);
+ hpet_msb_inc, NULL);
if (ret != 0) {
RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
internal_config.no_hpet = 1;
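
Note: the eal_timer.c hunk above changes hpet_msb_inc() to return void * so it
matches the void *(*)(void *) entry-point type expected by
rte_ctrl_thread_create(), instead of casting the function pointer; calling
through a cast between incompatible function types is undefined behaviour. A
small standalone illustration of the corrected pattern, using plain pthreads
rather than the DPDK wrapper:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* correct: the entry point already has the void *(*)(void *) signature,
 * so no function-pointer cast is needed when creating the thread */
static void *
worker(void *arg)
{
	(void)arg;
	/* e.g. periodically extend a 32-bit hardware counter to 64 bits */
	for (int i = 0; i < 3; i++)
		sleep(1);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, worker, NULL) != 0) {
		perror("pthread_create");
		return 1;
	}
	pthread_join(tid, NULL);
	return 0;
}
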
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index c68dc38e..0516b159 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num)
return NULL;
}
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
- int i;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
- }
-
- return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+ int iommu_group_num)
{
int i;
int vfio_group_fd;
struct vfio_group *cur_grp;
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num)
return vfio_group_fd;
}
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return vfio_cfg;
+ }
+
+ return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+ int i;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == container_fd)
+ return &vfio_cfgs[i];
+ }
+
+ return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
static int
get_vfio_group_idx(int vfio_group_fd)
{
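
Editor's note on the reordered hunks above: the public rte_vfio_get_group_fd() is now a thin wrapper that only resolves which vfio_config the group belongs to (falling back to the default container) and delegates the open/bookkeeping to the new internal vfio_get_group_fd(). A hedged usage sketch of the unchanged public entry point; the sysfs base path and the PCI address below are illustrative only:

#include <rte_vfio.h>

static int
open_group_for_device(void)
{
	int iommu_group_num;
	int ret;

	/* map a device to its IOMMU group, then fetch (or reuse) the
	 * group fd tracked in the matching vfio_config */
	ret = rte_vfio_get_group_num("/sys/bus/pci/devices",
			"0000:81:00.0", &iommu_group_num);
	if (ret <= 0)
		return -1;	/* not bound to vfio, or error */

	return rte_vfio_get_group_fd(iommu_group_num);
}
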
@@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
msl = rte_mem_virt2memseg_list(addr);
/* for IOVA as VA mode, no need to care for IOVA addresses */
- if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
@@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
/* memsegs are contiguous in memory */
ms = rte_mem_virt2memseg(addr, msl);
while (cur_len < len) {
+ /* some memory segments may have invalid IOVA */
+ if (ms->iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+ ms->addr);
+ goto next;
+ }
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 1);
else
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 0);
-
+next:
cur_len += ms->len;
++ms;
}
@@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname)
return 0;
}
- default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* open a new container */
+ default_vfio_cfg->vfio_container_fd =
+ rte_vfio_get_container_fd();
+ } else {
+ /* get the default container from the primary process */
+ default_vfio_cfg->vfio_container_fd =
+ vfio_get_default_container_fd();
+ }
/* check if we have VFIO driver enabled */
if (default_vfio_cfg->vfio_container_fd != -1) {
@@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname)
return default_vfio_cfg->vfio_enabled && mod_available;
}
+int
+vfio_get_default_container_fd(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ if (default_vfio_cfg->vfio_enabled)
+ return default_vfio_cfg->vfio_container_fd;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* if we were secondary process we would try requesting
+ * container fd from the primary, but we're the primary
+ * process so just exit here
+ */
+ return -1;
+ }
+
+ p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
+ }
+ free(mp_reply.msgs);
+ }
+
+ RTE_LOG(ERR, EAL, " cannot request default container fd\n");
+ return -1;
+}
+
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
@@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void)
mp_rep = &mp_reply.msgs[0];
p = (struct vfio_mp_param *)mp_rep->param;
if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_container_fd = mp_rep->fds[0];
free(mp_reply.msgs);
- return mp_rep->fds[0];
+ return vfio_container_fd;
}
free(mp_reply.msgs);
}
@@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base,
}
static int
-type1_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
int *vfio_container_fd = arg;
+ if (msl->external)
+ return 0;
+
return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
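
Editor's note on type1_map(): it now bails out early for externally allocated memseg lists, whose IOVAs are owned by the application and must not be mapped into the default VFIO container. The callback shape is the generic memseg-walk one; below is a hedged sketch of another walker applying the same msl->external filter (assumes the DPDK 18.11 rte_memseg_walk callback signature and headers; the function name is illustrative):

#include <stddef.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

/* Walker with the same guard as type1_map above: internal segments are
 * accounted, externally registered ones are skipped. Returning 0 keeps
 * the walk going. */
static int
sum_internal_seg_len(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg)
{
	size_t *total = arg;

	if (msl->external)
		return 0;

	*total += ms->len;
	return 0;
}

/* usage: size_t total = 0; rte_memseg_walk(sum_internal_seg_len, &total); */
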
@@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_type1_dma_unmap dma_unmap;
int ret;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
if (do_map != 0) {
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
memset(&dma_map, 0, sizeof(dma_map));
dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
dma_map.vaddr = vaddr;
@@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
} else {
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
- };
- reg.vaddr = (uintptr_t) vaddr;
- reg.size = len;
-
ret = ioctl(vfio_container_fd,
VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
if (ret) {
@@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
- return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ if (msl->external)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
@@ -1210,12 +1285,15 @@ struct spapr_walk_param {
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
+ if (msl->external)
+ return 0;
+
if (max > param->window_size) {
param->hugepage_sz = ms->hugepage_sz;
param->window_size = max;
@@ -1670,9 +1748,6 @@ int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp;
- int vfio_group_fd;
- int i;
vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
if (vfio_cfg == NULL) {
@@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
return -1;
}
- /* Check room for new group */
- if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- return -1;
- }
-
- /* Get an index for the new group */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
- return -1;
- }
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num);
- if (vfio_group_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
int
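
Editor's note on rte_vfio_container_group_bind(): the open-coded slot search and group open are dropped in favour of the same vfio_get_group_fd() helper used by the default path, so both paths share one piece of bookkeeping. A hedged usage sketch of the custom-container flow this API serves; the group number is assumed to come from rte_vfio_get_group_num():

#include <rte_vfio.h>

static int
bind_group_to_new_container(int iommu_group_num)
{
	int container_fd = rte_vfio_container_create();

	if (container_fd < 0)
		return -1;

	/* the group is now tracked in that container's vfio_config,
	 * via the same vfio_get_group_fd() helper as the default path */
	return rte_vfio_container_group_bind(container_fd, iommu_group_num);
}
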
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 68d4750a..63ae115c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -115,6 +115,9 @@ struct vfio_iommu_type {
vfio_dma_func_t dma_map_func;
};
+/* get the vfio container that devices are bound to by default */
+int vfio_get_default_container_fd(void);
+
/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd);
@@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void);
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 680a24aa..a1e8c834 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
reply.fds[0] = fd;
}
break;
+ case SOCKET_REQ_DEFAULT_CONTAINER:
+ r->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ fd = vfio_get_default_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
default:
RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
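
Editor's note on the new SOCKET_REQ_DEFAULT_CONTAINER branch: it lets a secondary process obtain the primary's default container fd over the rte_mp channel, with the fd travelling as ancillary data (num_fds/fds[]) rather than in the parameter blob. Below is a hedged sketch of a primary-side responder built the same way; the action name, helper and /dev/null stand-in are illustrative and not part of this patch (assumes the DPDK 18.11 rte_mp API):

#include <fcntl.h>
#include <string.h>
#include <rte_eal.h>

#define EXAMPLE_MP "example_fd_service"	/* illustrative action name */

static int
example_open_fd(void)
{
	return open("/dev/null", O_RDONLY);	/* stand-in for a container fd */
}

static int
example_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
	struct rte_mp_msg reply;

	memset(&reply, 0, sizeof(reply));
	strcpy(reply.name, msg->name);

	reply.fds[0] = example_open_fd();
	if (reply.fds[0] >= 0)
		reply.num_fds = 1;	/* fd is passed via SCM_RIGHTS */

	return rte_mp_reply(&reply, peer);
}

/* registered once in the primary, e.g. from an init path:
 *	rte_mp_action_register(EXAMPLE_MP, example_mp_primary);
 */
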
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index cfa9448b..5afa0871 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -8,6 +8,7 @@
#ifdef __KERNEL__
#include <linux/if.h>
+#include <asm/barrier.h>
#define RTE_STD_C11
#else
#include <rte_common.h>
@@ -54,8 +55,13 @@ struct rte_kni_request {
* Writing should never overwrite the read position
*/
struct rte_kni_fifo {
+#ifdef RTE_USE_C11_MEM_MODEL
+ unsigned write; /**< Next position to be written*/
+ unsigned read; /**< Next position to be read */
+#else
volatile unsigned write; /**< Next position to be written*/
volatile unsigned read; /**< Next position to be read */
+#endif
unsigned len; /**< Circular buffer length */
unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
void *volatile buffer[]; /**< The buffer contains mbuf pointers */
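
Editor's note on the rte_kni_fifo change: under RTE_USE_C11_MEM_MODEL the read/write indices lose the volatile qualifier because the fifo is then accessed through C11-style atomics with explicit acquire/release ordering, which both stops the compiler from caching the indices and adds the inter-thread ordering that volatile alone never provided. A hedged sketch of that access pattern for a single-producer enqueue; the structure and function names are illustrative, not the actual KNI fifo code:

#include <stdint.h>

struct demo_fifo {
	unsigned int write;	/* next slot to be written */
	unsigned int read;	/* next slot to be read */
	unsigned int len;	/* ring size */
	void *buffer[64];
};

/* Single-producer enqueue: load the consumer's read index with acquire
 * semantics, then publish the new write index with release semantics so
 * the stored pointer is visible before the index moves. */
static inline int
demo_fifo_put(struct demo_fifo *f, void *obj)
{
	unsigned int w = f->write;
	unsigned int r = __atomic_load_n(&f->read, __ATOMIC_ACQUIRE);

	if ((w + 1) % f->len == r)
		return 0;	/* fifo full */

	f->buffer[w] = obj;
	__atomic_store_n(&f->write, (w + 1) % f->len, __ATOMIC_RELEASE);
	return 1;
}
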