path: root/lib/librte_eal
Diffstat (limited to 'lib/librte_eal')
-rw-r--r--  lib/librte_eal/bsdapp/eal/Makefile | 9
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal.c | 54
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_dev.c | 14
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_memalloc.c | 21
-rw-r--r--  lib/librte_eal/bsdapp/eal/eal_memory.c | 9
-rw-r--r--  lib/librte_eal/common/Makefile | 1
-rw-r--r--  lib/librte_eal/common/arch/arm/meson.build | 2
-rw-r--r--  lib/librte_eal/common/arch/ppc_64/meson.build | 5
-rw-r--r--  lib/librte_eal/common/arch/x86/meson.build | 2
-rw-r--r--  lib/librte_eal/common/eal_common_bus.c | 45
-rw-r--r--  lib/librte_eal/common/eal_common_class.c | 2
-rw-r--r--  lib/librte_eal/common/eal_common_dev.c | 347
-rw-r--r--  lib/librte_eal/common/eal_common_devargs.c | 52
-rw-r--r--  lib/librte_eal/common/eal_common_fbarray.c | 5
-rw-r--r--  lib/librte_eal/common/eal_common_memory.c | 210
-rw-r--r--  lib/librte_eal/common/eal_common_memzone.c | 8
-rw-r--r--  lib/librte_eal/common/eal_common_options.c | 42
-rw-r--r--  lib/librte_eal/common/eal_common_proc.c | 10
-rw-r--r--  lib/librte_eal/common/eal_common_string_fns.c | 26
-rw-r--r--  lib/librte_eal/common/eal_common_timer.c | 24
-rw-r--r--  lib/librte_eal/common/eal_filesystem.h | 15
-rw-r--r--  lib/librte_eal/common/eal_internal_cfg.h | 1
-rw-r--r--  lib/librte_eal/common/eal_memalloc.h | 11
-rw-r--r--  lib/librte_eal/common/eal_options.h | 2
-rw-r--r--  lib/librte_eal/common/eal_private.h | 90
-rw-r--r--  lib/librte_eal/common/hotplug_mp.c | 426
-rw-r--r--  lib/librte_eal/common/hotplug_mp.h | 46
-rw-r--r--  lib/librte_eal/common/include/arch/arm/rte_cycles_32.h | 4
-rw-r--r--  lib/librte_eal/common/include/arch/ppc_64/meson.build | 16
-rw-r--r--  lib/librte_eal/common/include/arch/ppc_64/rte_pause.h | 7
-rw-r--r--  lib/librte_eal/common/include/generic/rte_cycles.h | 11
-rw-r--r--  lib/librte_eal/common/include/rte_bitmap.h | 14
-rw-r--r--  lib/librte_eal/common/include/rte_bus.h | 34
-rw-r--r--  lib/librte_eal/common/include/rte_common.h | 11
-rw-r--r--  lib/librte_eal/common/include/rte_dev.h | 123
-rw-r--r--  lib/librte_eal/common/include/rte_devargs.h | 81
-rw-r--r--  lib/librte_eal/common/include/rte_eal.h | 20
-rw-r--r--  lib/librte_eal/common/include/rte_eal_interrupts.h | 1
-rw-r--r--  lib/librte_eal/common/include/rte_eal_memconfig.h | 18
-rw-r--r--  lib/librte_eal/common/include/rte_malloc.h | 192
-rw-r--r--  lib/librte_eal/common/include/rte_malloc_heap.h | 3
-rw-r--r--  lib/librte_eal/common/include/rte_memory.h | 109
-rw-r--r--  lib/librte_eal/common/include/rte_option.h | 63
-rw-r--r--  lib/librte_eal/common/include/rte_string_fns.h | 26
-rw-r--r--  lib/librte_eal/common/include/rte_version.h | 6
-rw-r--r--  lib/librte_eal/common/include/rte_vfio.h | 31
-rw-r--r--  lib/librte_eal/common/malloc_elem.c | 10
-rw-r--r--  lib/librte_eal/common/malloc_heap.c | 340
-rw-r--r--  lib/librte_eal/common/malloc_heap.h | 17
-rw-r--r--  lib/librte_eal/common/malloc_mp.c | 4
-rw-r--r--  lib/librte_eal/common/meson.build | 5
-rw-r--r--  lib/librte_eal/common/rte_malloc.c | 436
-rw-r--r--  lib/librte_eal/common/rte_option.c | 54
-rw-r--r--  lib/librte_eal/linuxapp/eal/Makefile | 20
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal.c | 119
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_dev.c | 172
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 1
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_interrupts.c | 79
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memalloc.c | 466
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_memory.c | 264
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_thread.c | 4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_timer.c | 5
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.c | 216
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio.h | 4
-rw-r--r--  lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 11
-rw-r--r--  lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 6
-rw-r--r--  lib/librte_eal/meson.build | 3
-rw-r--r--  lib/librte_eal/rte_eal_version.map | 39
68 files changed, 3862 insertions, 662 deletions
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index d27da3d1..bfeddaad 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -22,7 +22,7 @@ LDLIBS += -lrte_kvargs
EXPORT_MAP := ../../rte_eal_version.map
-LIBABIVER := 8
+LIBABIVER := 9
# specific to bsdapp exec-env
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c
@@ -62,10 +62,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_proc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_uuid.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += hotplug_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_option.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
@@ -77,11 +79,6 @@ SRCS-y += rte_cycles.c
CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-CFLAGS_eal.o := -D_GNU_SOURCE
-#CFLAGS_eal_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_log.o := -D_GNU_SOURCE
-CFLAGS_eal_common_log.o := -D_GNU_SOURCE
-
# workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d7ae9d68..508cbc46 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -42,6 +42,7 @@
#include <rte_devargs.h>
#include <rte_version.h>
#include <rte_vfio.h>
+#include <rte_option.h>
#include <rte_atomic.h>
#include <malloc_heap.h>
@@ -141,7 +142,7 @@ eal_create_runtime_dir(void)
}
const char *
-eal_get_runtime_dir(void)
+rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
@@ -414,12 +415,20 @@ eal_parse_args(int argc, char **argv)
argvopt = argv;
optind = 1;
optreset = 1;
+ opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
- /* getopt is not happy, stop right now */
+ /*
+ * getopt didn't recognise the option, let's parse the
+ * registered options to see if the flag is valid
+ */
if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
eal_usage(prgname);
ret = -1;
goto out;
@@ -502,6 +511,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
+ if (msl->external)
+ return 0;
+
if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;
@@ -607,7 +619,7 @@ rte_eal_init(int argc, char **argv)
internal_config.legacy_mem = true;
if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
@@ -622,7 +634,7 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
@@ -630,7 +642,7 @@ rte_eal_init(int argc, char **argv)
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0) {
- rte_eal_init_alert("failed to init mp channel\n");
+ rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
@@ -638,14 +650,21 @@ rte_eal_init(int argc, char **argv)
}
if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
- /* autodetect the iova mapping mode (default is iova_pa) */
- rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ rte_eal_get_configuration()->iova_mode =
+ rte_bus_get_iommu_class();
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
+ }
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
@@ -685,37 +704,37 @@ rte_eal_init(int argc, char **argv)
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory\n");
+ rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
@@ -765,14 +784,14 @@ rte_eal_init(int argc, char **argv)
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
- rte_eal_init_alert("rte_service_init() failed\n");
+ rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices\n");
+ rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
@@ -788,6 +807,9 @@ rte_eal_init(int argc, char **argv)
rte_eal_mcfg_complete();
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
return fctret;
}
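
The eal.c hunks above make two behavioural changes: flags that getopt does not recognise now fall through to rte_option_parse() instead of aborting argument parsing, and rte_option_init() runs the callback of every option that was seen once EAL init completes. Below is a minimal sketch of how a library might register such an option; it assumes the struct rte_option layout and rte_option_register() from the new rte_option.h in this series, and the option name and callback are illustrative only:

    #include <rte_common.h>
    #include <rte_option.h>

    /* hypothetical callback, run from rte_option_init() after EAL init */
    static int
    my_feature_cb(void)
    {
            /* start the optional subsystem here */
            return 0;
    }

    static struct rte_option my_option = {
            .opt_str = "my-feature",  /* matched against "--my-feature" */
            .cb = my_feature_cb,
            .enabled = 0,             /* set by rte_option_parse() when seen */
    };

    RTE_INIT(my_option_init)
    {
            rte_option_register(&my_option);
    }
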
diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c b/lib/librte_eal/bsdapp/eal/eal_dev.c
index 1c6c51bd..255d611e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_dev.c
+++ b/lib/librte_eal/bsdapp/eal/eal_dev.c
@@ -19,3 +19,17 @@ rte_dev_event_monitor_stop(void)
RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
return -1;
}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
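
On FreeBSD both new calls simply report that device events are unsupported; the Linux counterparts (see linuxapp/eal/eal_dev.c in the diffstat) implement them. As a rough usage sketch, assuming the Linux implementation, an application would combine them with the existing device event monitor API (all of these are experimental in this release):

    #include <stdio.h>
    #include <rte_common.h>
    #include <rte_dev.h>

    static void
    dev_event_cb(const char *device_name, enum rte_dev_event_type event,
                    void *cb_arg __rte_unused)
    {
            if (event == RTE_DEV_EVENT_REMOVE)
                    printf("device %s removed\n", device_name);
    }

    static int
    setup_hotplug_events(void)
    {
            if (rte_dev_event_monitor_start() < 0)
                    return -1;
            /* let EAL recover from SIGBUS caused by hot-unplugged devices */
            if (rte_dev_hotplug_handle_enable() < 0)
                    return -1;
            /* NULL device name: receive events for all devices */
            return rte_dev_event_callback_register(NULL, dev_event_cb, NULL);
    }
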
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index f7f07abd..a5847f0b 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -4,6 +4,7 @@
#include <inttypes.h>
+#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
@@ -48,6 +49,26 @@ eal_memalloc_sync_with_primary(void)
}
int
+eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused,
+ int fd __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused,
+ int seg_idx __rte_unused, size_t *offset __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
eal_memalloc_init(void)
{
return 0;
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index 16d2bc7c..4b092e1f 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -79,6 +79,7 @@ rte_eal_hugepage_init(void)
}
msl->base_va = addr;
msl->page_sz = page_sz;
+ msl->len = internal_config.memory;
msl->socket_id = 0;
/* populate memsegs. each memseg is 1 page long */
@@ -235,12 +236,15 @@ struct attach_walk_args {
int seg_idx;
};
static int
-attach_segment(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
struct attach_walk_args *wa = arg;
void *addr;
+ if (msl->external)
+ return 0;
+
addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
wa->seg_idx * EAL_PAGE_SIZE);
@@ -370,6 +374,7 @@ alloc_va_space(struct rte_memseg_list *msl)
return -1;
}
msl->base_va = addr;
+ msl->len = mem_sz;
return 0;
}
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index cca68826..87d8c455 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -12,6 +12,7 @@ INC += rte_tailq.h rte_interrupts.h rte_alarm.h
INC += rte_string_fns.h rte_version.h
INC += rte_eal_memconfig.h rte_malloc_heap.h
INC += rte_hexdump.h rte_devargs.h rte_bus.h rte_dev.h rte_class.h
+INC += rte_option.h
INC += rte_pci_dev_feature_defs.h rte_pci_dev_features.h
INC += rte_malloc.h rte_keepalive.h rte_time.h
INC += rte_service.h rte_service_component.h
diff --git a/lib/librte_eal/common/arch/arm/meson.build b/lib/librte_eal/common/arch/arm/meson.build
index c6bd9227..79731e1a 100644
--- a/lib/librte_eal/common/arch/arm/meson.build
+++ b/lib/librte_eal/common/arch/arm/meson.build
@@ -2,4 +2,4 @@
# Copyright(c) 2017 Intel Corporation.
eal_common_arch_sources = files('rte_cpuflags.c',
- 'rte_cycles.c')
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/arch/ppc_64/meson.build b/lib/librte_eal/common/arch/ppc_64/meson.build
new file mode 100644
index 00000000..40b3dc53
--- /dev/null
+++ b/lib/librte_eal/common/arch/ppc_64/meson.build
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Luca Boccassi <bluca@debian.org>
+
+eal_common_arch_sources = files('rte_cpuflags.c',
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/arch/x86/meson.build b/lib/librte_eal/common/arch/x86/meson.build
index 4e0f7790..14bf204c 100644
--- a/lib/librte_eal/common/arch/x86/meson.build
+++ b/lib/librte_eal/common/arch/x86/meson.build
@@ -2,4 +2,4 @@
# Copyright(c) 2017 Intel Corporation
eal_common_arch_sources = files('rte_spinlock.c', 'rte_cpuflags.c',
- 'rte_cycles.c')
+ 'rte_cycles.c', 'rte_hypervisor.c')
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index 0943851c..c8f1901f 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -37,10 +37,11 @@
#include <rte_bus.h>
#include <rte_debug.h>
#include <rte_string_fns.h>
+#include <rte_errno.h>
#include "eal_private.h"
-struct rte_bus_list rte_bus_list =
+static struct rte_bus_list rte_bus_list =
TAILQ_HEAD_INITIALIZER(rte_bus_list);
void
@@ -242,3 +243,45 @@ rte_bus_get_iommu_class(void)
}
return mode;
}
+
+static int
+bus_handle_sigbus(const struct rte_bus *bus,
+ const void *failure_addr)
+{
+ int ret;
+
+ if (!bus->sigbus_handler)
+ return -1;
+
+ ret = bus->sigbus_handler(failure_addr);
+
+ /* bus found but its handler failed; keep the errno that was set. */
+ if (ret < 0 && rte_errno == 0)
+ rte_errno = ENOTSUP;
+
+ return ret > 0;
+}
+
+int
+rte_bus_sigbus_handler(const void *failure_addr)
+{
+ struct rte_bus *bus;
+
+ int ret = 0;
+ int old_errno = rte_errno;
+
+ rte_errno = 0;
+
+ bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
+ /* cannot find a bus to handle it. */
+ if (!bus)
+ return 1;
+ /* bus found but its handler failed; pass on the new errno. */
+ else if (rte_errno != 0)
+ return -1;
+
+ /* restore the old errno. */
+ rte_errno = old_errno;
+
+ return ret;
+}
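
rte_bus_sigbus_handler() walks the registered buses via rte_bus_find() and lets the first bus whose sigbus_handler claims the faulting address deal with it (return 0: handled; < 0: claimed but recovery failed; > 0: not this bus). A hedged sketch of a bus driver wiring in such a handler follows; my_bus_owns_addr() and my_bus_recover() are assumed helpers, and the sigbus_handler member comes from the rte_bus.h change listed in the diffstat:

    #include <errno.h>
    #include <rte_bus.h>
    #include <rte_errno.h>

    /* assumed helpers, provided by the bus driver elsewhere */
    extern int my_bus_owns_addr(const void *addr);
    extern int my_bus_recover(const void *addr);

    static int
    my_bus_sigbus_handler(const void *failure_addr)
    {
            if (!my_bus_owns_addr(failure_addr))
                    return 1;   /* >0: not ours, keep searching buses */
            if (my_bus_recover(failure_addr) != 0) {
                    rte_errno = EIO;
                    return -1;  /* <0: ours, but recovery failed */
            }
            return 0;           /* handled */
    }

    static struct rte_bus my_bus = {
            /* .scan, .probe, .find_device, ... */
            .sigbus_handler = my_bus_sigbus_handler,
    };

    RTE_REGISTER_BUS(my_bus, my_bus);
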
diff --git a/lib/librte_eal/common/eal_common_class.c b/lib/librte_eal/common/eal_common_class.c
index 404a9065..d922266d 100644
--- a/lib/librte_eal/common/eal_common_class.c
+++ b/lib/librte_eal/common/eal_common_class.c
@@ -9,7 +9,7 @@
#include <rte_class.h>
#include <rte_debug.h>
-struct rte_class_list rte_class_list =
+static struct rte_class_list rte_class_list =
TAILQ_HEAD_INITIALIZER(rte_class_list);
__rte_experimental void
diff --git a/lib/librte_eal/common/eal_common_dev.c b/lib/librte_eal/common/eal_common_dev.c
index 678dbcac..62e9ed47 100644
--- a/lib/librte_eal/common/eal_common_dev.c
+++ b/lib/librte_eal/common/eal_common_dev.c
@@ -19,8 +19,10 @@
#include <rte_log.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
+#include <rte_string_fns.h>
#include "eal_private.h"
+#include "hotplug_mp.h"
/**
* The device event callback description.
@@ -74,119 +76,110 @@ static int cmp_dev_name(const struct rte_device *dev, const void *_name)
return strcmp(dev->name, name);
}
-int rte_eal_dev_attach(const char *name, const char *devargs)
+int __rte_experimental
+rte_dev_is_probed(const struct rte_device *dev)
{
- struct rte_bus *bus;
+ /* The field driver should be set only when the probe is successful. */
+ return dev->driver != NULL;
+}
- if (name == NULL || devargs == NULL) {
- RTE_LOG(ERR, EAL, "Invalid device or arguments provided\n");
+/* helper function to build devargs, caller should free the memory */
+static int
+build_devargs(const char *busname, const char *devname,
+ const char *drvargs, char **devargs)
+{
+ int length;
+
+ length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs);
+ if (length < 0)
return -EINVAL;
- }
- bus = rte_bus_find_by_device_name(name);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Unable to find a bus for the device '%s'\n",
- name);
+ *devargs = malloc(length + 1);
+ if (*devargs == NULL)
+ return -ENOMEM;
+
+ length = snprintf(*devargs, length + 1, "%s:%s,%s",
+ busname, devname, drvargs);
+ if (length < 0) {
+ free(*devargs);
return -EINVAL;
}
- if (strcmp(bus->name, "pci") == 0 || strcmp(bus->name, "vdev") == 0)
- return rte_eal_hotplug_add(bus->name, name, devargs);
-
- RTE_LOG(ERR, EAL,
- "Device attach is only supported for PCI and vdev devices.\n");
- return -ENOTSUP;
+ return 0;
}
-int rte_eal_dev_detach(struct rte_device *dev)
+int
+rte_eal_hotplug_add(const char *busname, const char *devname,
+ const char *drvargs)
{
- struct rte_bus *bus;
- int ret;
- if (dev == NULL) {
- RTE_LOG(ERR, EAL, "Invalid device provided.\n");
- return -EINVAL;
- }
+ char *devargs;
+ int ret;
- bus = rte_bus_find_by_device(dev);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
- dev->name);
- return -EINVAL;
- }
+ ret = build_devargs(busname, devname, drvargs, &devargs);
+ if (ret != 0)
+ return ret;
- if (bus->unplug == NULL) {
- RTE_LOG(ERR, EAL, "Bus function not supported\n");
- return -ENOTSUP;
- }
+ ret = rte_dev_probe(devargs);
+ free(devargs);
- ret = bus->unplug(dev);
- if (ret)
- RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n",
- dev->name);
return ret;
}
-int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname,
- const char *devargs)
+/* probe device at local process. */
+int
+local_dev_probe(const char *devargs, struct rte_device **new_dev)
{
- struct rte_bus *bus;
struct rte_device *dev;
struct rte_devargs *da;
int ret;
- bus = rte_bus_find_by_name(busname);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname);
- return -ENOENT;
- }
-
- if (bus->plug == NULL) {
- RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
- bus->name);
- return -ENOTSUP;
- }
-
+ *new_dev = NULL;
da = calloc(1, sizeof(*da));
if (da == NULL)
return -ENOMEM;
- ret = rte_devargs_parsef(da, "%s:%s,%s",
- busname, devname, devargs);
+ ret = rte_devargs_parse(da, devargs);
if (ret)
goto err_devarg;
+ if (da->bus->plug == NULL) {
+ RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
+ da->bus->name);
+ ret = -ENOTSUP;
+ goto err_devarg;
+ }
+
ret = rte_devargs_insert(da);
if (ret)
goto err_devarg;
- ret = bus->scan();
+ ret = da->bus->scan();
if (ret)
goto err_devarg;
- dev = bus->find_device(NULL, cmp_dev_name, devname);
+ dev = da->bus->find_device(NULL, cmp_dev_name, da->name);
if (dev == NULL) {
RTE_LOG(ERR, EAL, "Cannot find device (%s)\n",
- devname);
+ da->name);
ret = -ENODEV;
goto err_devarg;
}
- if (dev->driver != NULL) {
- RTE_LOG(ERR, EAL, "Device is already plugged\n");
- return -EEXIST;
- }
-
- ret = bus->plug(dev);
+ ret = dev->bus->plug(dev);
if (ret) {
+ if (rte_dev_is_probed(dev)) /* if already succeeded earlier */
+ return ret; /* no rollback */
RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n",
dev->name);
goto err_devarg;
}
+
+ *new_dev = dev;
return 0;
err_devarg:
- if (rte_devargs_remove(busname, devname)) {
+ if (rte_devargs_remove(da) != 0) {
free(da->args);
free(da);
}
@@ -194,40 +187,235 @@ err_devarg:
}
int __rte_experimental
-rte_eal_hotplug_remove(const char *busname, const char *devname)
+rte_dev_probe(const char *devargs)
{
- struct rte_bus *bus;
+ struct eal_dev_mp_req req;
struct rte_device *dev;
int ret;
+ memset(&req, 0, sizeof(req));
+ req.t = EAL_DEV_REQ_TYPE_ATTACH;
+ strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ /**
+ * If in secondary process, just send IPC request to
+ * primary process.
+ */
+ ret = eal_dev_hotplug_request_to_primary(&req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug request to primary\n");
+ return -ENOMSG;
+ }
+ if (req.result != 0)
+ RTE_LOG(ERR, EAL,
+ "Failed to hotplug add device\n");
+ return req.result;
+ }
+
+ /* attach a shared device from primary start from here: */
+
+ /* primary attach the new device itself. */
+ ret = local_dev_probe(devargs, &dev);
+
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to attach device on primary process\n");
+
+ /**
+ * it is possible that a secondary process failed to attach a
+ * device that the primary process already has from initialization,
+ * so for -EEXIST case, we still need to sync with secondary
+ * process.
+ */
+ if (ret != -EEXIST)
+ return ret;
+ }
+
+ /* primary send attach sync request to secondary. */
+ ret = eal_dev_hotplug_request_to_secondary(&req);
+
+ /* if any communication error, we need to rollback. */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug add request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ /**
+ * if any secondary failed to attach, we need to consider if rollback
+ * is necessary.
+ */
+ if (req.result != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to attach device on secondary process\n");
+ ret = req.result;
+
+ /* for -EEXIST, we don't need to rollback. */
+ if (ret == -EEXIST)
+ return ret;
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK;
+
+ /* primary send rollback request to secondary. */
+ if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device attach on secondary."
+ "Devices in secondary may not sync with primary\n");
+
+ /* primary rollback itself. */
+ if (local_dev_remove(dev) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device attach on primary."
+ "Devices in secondary may not sync with primary\n");
+
+ return ret;
+}
+
+int
+rte_eal_hotplug_remove(const char *busname, const char *devname)
+{
+ struct rte_device *dev;
+ struct rte_bus *bus;
+
bus = rte_bus_find_by_name(busname);
if (bus == NULL) {
RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname);
return -ENOENT;
}
- if (bus->unplug == NULL) {
- RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
- bus->name);
- return -ENOTSUP;
- }
-
dev = bus->find_device(NULL, cmp_dev_name, devname);
if (dev == NULL) {
RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname);
return -EINVAL;
}
- if (dev->driver == NULL) {
- RTE_LOG(ERR, EAL, "Device is already unplugged\n");
- return -ENOENT;
+ return rte_dev_remove(dev);
+}
+
+/* remove device at local process. */
+int
+local_dev_remove(struct rte_device *dev)
+{
+ int ret;
+
+ if (dev->bus->unplug == NULL) {
+ RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
+ dev->bus->name);
+ return -ENOTSUP;
}
- ret = bus->unplug(dev);
- if (ret)
+ ret = dev->bus->unplug(dev);
+ if (ret) {
RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n",
dev->name);
- rte_devargs_remove(busname, devname);
+ return ret;
+ }
+
+ return 0;
+}
+
+int __rte_experimental
+rte_dev_remove(struct rte_device *dev)
+{
+ struct eal_dev_mp_req req;
+ char *devargs;
+ int ret;
+
+ if (!rte_dev_is_probed(dev)) {
+ RTE_LOG(ERR, EAL, "Device is not probed\n");
+ return -ENOENT;
+ }
+
+ ret = build_devargs(dev->bus->name, dev->name, "", &devargs);
+ if (ret != 0)
+ return ret;
+
+ memset(&req, 0, sizeof(req));
+ req.t = EAL_DEV_REQ_TYPE_DETACH;
+ strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+ free(devargs);
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ /**
+ * If in secondary process, just send IPC request to
+ * primary process.
+ */
+ ret = eal_dev_hotplug_request_to_primary(&req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send hotplug request to primary\n");
+ return -ENOMSG;
+ }
+ if (req.result != 0)
+ RTE_LOG(ERR, EAL,
+ "Failed to hotplug remove device\n");
+ return req.result;
+ }
+
+ /* detach a device from primary start from here: */
+
+ /* primary send detach sync request to secondary */
+ ret = eal_dev_hotplug_request_to_secondary(&req);
+
+ /**
+ * if communication error, we need to rollback, because it is possible
+ * part of the secondary processes still detached it successfully.
+ */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to send device detach request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ /**
+ * if any secondary failed to detach, we need to consider if rollback
+ * is necessary.
+ */
+ if (req.result != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to detach device on secondary process\n");
+ ret = req.result;
+ /**
+ * if -ENOENT, we don't need to rollback, since the device is
+ * already detached on secondary process.
+ */
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+
+ /* primary detach the device itself. */
+ ret = local_dev_remove(dev);
+
+ /* if primary failed, still need to consider if rollback is necessary */
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to detach device on primary process\n");
+ /* if -ENOENT, we don't need to rollback */
+ if (ret == -ENOENT)
+ return ret;
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK;
+
+ /* primary send rollback request to secondary. */
+ if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+ RTE_LOG(WARNING, EAL,
+ "Failed to rollback device detach on secondary."
+ "Devices in secondary may not sync with primary\n");
+
return ret;
}
@@ -342,8 +530,9 @@ rte_dev_event_callback_unregister(const char *device_name,
return ret;
}
-void
-dev_callback_process(char *device_name, enum rte_dev_event_type event)
+void __rte_experimental
+rte_dev_event_callback_process(const char *device_name,
+ enum rte_dev_event_type event)
{
struct dev_event_callback *cb_lst;
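
Taken together, rte_dev_probe() and rte_dev_remove() replace the old rte_eal_dev_attach()/rte_eal_dev_detach() pair and keep primary and secondary processes in sync through the IPC requests above, rolling back on partial failure. A small usage sketch of these experimental APIs, using the net_null vdev as the example device:

    #include <string.h>
    #include <rte_bus.h>
    #include <rte_dev.h>

    static int
    cmp_name(const struct rte_device *dev, const void *name)
    {
            return strcmp(dev->name, name);
    }

    static int
    hotplug_cycle(void)
    {
            struct rte_bus *bus = rte_bus_find_by_name("vdev");
            struct rte_device *dev;

            /* attach in every process of the application */
            if (bus == NULL || rte_dev_probe("vdev:net_null0") != 0)
                    return -1;

            dev = bus->find_device(NULL, cmp_name, "net_null0");
            if (dev == NULL || !rte_dev_is_probed(dev))
                    return -1;

            /* detach everywhere, rolling back on partial failure */
            return rte_dev_remove(dev);
    }
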
diff --git a/lib/librte_eal/common/eal_common_devargs.c b/lib/librte_eal/common/eal_common_devargs.c
index dac2402a..b7b9cb69 100644
--- a/lib/librte_eal/common/eal_common_devargs.c
+++ b/lib/librte_eal/common/eal_common_devargs.c
@@ -4,9 +4,6 @@
/* This file manages the list of devices and their arguments, as given
* by the user at startup
- *
- * Code here should not call rte_log since the EAL environment
- * may not be initialized.
*/
#include <stdio.h>
@@ -28,39 +25,9 @@
TAILQ_HEAD(rte_devargs_list, rte_devargs);
/** Global list of user devices */
-struct rte_devargs_list devargs_list =
+static struct rte_devargs_list devargs_list =
TAILQ_HEAD_INITIALIZER(devargs_list);
-int
-rte_eal_parse_devargs_str(const char *devargs_str,
- char **drvname, char **drvargs)
-{
- char *sep;
-
- if ((devargs_str) == NULL || (drvname) == NULL || (drvargs == NULL))
- return -1;
-
- *drvname = strdup(devargs_str);
- if (*drvname == NULL)
- return -1;
-
- /* set the first ',' to '\0' to split name and arguments */
- sep = strchr(*drvname, ',');
- if (sep != NULL) {
- sep[0] = '\0';
- *drvargs = strdup(sep + 1);
- } else {
- *drvargs = strdup("");
- }
-
- if (*drvargs == NULL) {
- free(*drvname);
- *drvname = NULL;
- return -1;
- }
- return 0;
-}
-
static size_t
devargs_layer_count(const char *s)
{
@@ -270,6 +237,7 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...)
va_list ap;
size_t len;
char *dev;
+ int ret;
if (da == NULL)
return -EINVAL;
@@ -288,7 +256,10 @@ rte_devargs_parsef(struct rte_devargs *da, const char *format, ...)
vsnprintf(dev, len + 1, format, ap);
va_end(ap);
- return rte_devargs_parse(da, dev);
+ ret = rte_devargs_parse(da, dev);
+
+ free(dev);
+ return ret;
}
int __rte_experimental
@@ -296,7 +267,7 @@ rte_devargs_insert(struct rte_devargs *da)
{
int ret;
- ret = rte_devargs_remove(da->bus->name, da->name);
+ ret = rte_devargs_remove(da);
if (ret < 0)
return ret;
TAILQ_INSERT_TAIL(&devargs_list, da, next);
@@ -342,14 +313,17 @@ fail:
}
int __rte_experimental
-rte_devargs_remove(const char *busname, const char *devname)
+rte_devargs_remove(struct rte_devargs *devargs)
{
struct rte_devargs *d;
void *tmp;
+ if (devargs == NULL || devargs->bus == NULL)
+ return -1;
+
TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) {
- if (strcmp(d->bus->name, busname) == 0 &&
- strcmp(d->name, devname) == 0) {
+ if (strcmp(d->bus->name, devargs->bus->name) == 0 &&
+ strcmp(d->name, devargs->name) == 0) {
TAILQ_REMOVE(&devargs_list, d, next);
free(d->args);
free(d);
diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
index 43caf3ce..ea0735cb 100644
--- a/lib/librte_eal/common/eal_common_fbarray.c
+++ b/lib/librte_eal/common/eal_common_fbarray.c
@@ -2,6 +2,7 @@
* Copyright(c) 2017-2018 Intel Corporation
*/
+#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <sys/mman.h>
@@ -878,6 +879,10 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
if (ret)
return ret;
+ /* with no shconf, there were never any files to begin with */
+ if (internal_config.no_shconf)
+ return 0;
+
/* try deleting the file */
eal_get_fbarray_path(path, sizeof(path), arr->name);
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index fbfb1b05..12dcedf5 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -2,6 +2,7 @@
* Copyright(c) 2010-2014 Intel Corporation
*/
+#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@@ -37,6 +38,23 @@
static void *next_baseaddr;
static uint64_t system_page_sz;
+#ifdef RTE_ARCH_64
+/*
+ * The Linux kernel uses a really high address as the starting address
+ * for serving mmap calls. If addressing limitations exist and IOVA mode is VA,
+ * this starting address is likely too high for those devices. However, it
+ * is possible to use a lower address in the process virtual address space
+ * as with 64 bits there is a lot of available space.
+ *
+ * Current known limitations are 39 or 40 bits. Setting the starting address
+ * at 4GB implies there are 508GB or 1020GB for mapping the available
+ * hugepages. This is likely enough for most systems, although a device with
+ * addressing limitations should call rte_eal_check_dma_mask to ensure all
+ * memory is within supported range.
+ */
+static uint64_t baseaddr = 0x100000000;
+#endif
+
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
size_t page_sz, int flags, int mmap_flags)
@@ -60,6 +78,11 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
rte_eal_process_type() == RTE_PROC_PRIMARY)
next_baseaddr = (void *) internal_config.base_virtaddr;
+#ifdef RTE_ARCH_64
+ if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY)
+ next_baseaddr = (void *) baseaddr;
+#endif
if (requested_addr == NULL && next_baseaddr != NULL) {
requested_addr = next_baseaddr;
requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
@@ -91,7 +114,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
mmap_flags, -1, 0);
if (mapped_addr == MAP_FAILED && allow_shrink)
*size -= page_sz;
- } while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);
+
+ if (mapped_addr != MAP_FAILED && addr_is_hint &&
+ mapped_addr != requested_addr) {
+ /* hint was not used. Try with another offset */
+ munmap(mapped_addr, map_sz);
+ mapped_addr = MAP_FAILED;
+ next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
+ requested_addr = next_baseaddr;
+ }
+ } while ((allow_shrink || addr_is_hint) &&
+ mapped_addr == MAP_FAILED && *size > 0);
/* align resulting address - if map failed, we will ignore the value
* anyway, so no need to add additional checks.
@@ -171,7 +204,7 @@ virt2memseg(const void *addr, const struct rte_memseg_list *msl)
/* a memseg list was specified, check if it's the right one */
start = msl->base_va;
- end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);
+ end = RTE_PTR_ADD(start, msl->len);
if (addr < start || addr >= end)
return NULL;
@@ -194,8 +227,7 @@ virt2memseg_list(const void *addr)
msl = &mcfg->memsegs[msl_idx];
start = msl->base_va;
- end = RTE_PTR_ADD(start,
- (size_t)msl->page_sz * msl->memseg_arr.len);
+ end = RTE_PTR_ADD(start, msl->len);
if (addr >= start && addr < end)
break;
}
@@ -273,6 +305,9 @@ physmem_size(const struct rte_memseg_list *msl, void *arg)
{
uint64_t *total_len = arg;
+ if (msl->external)
+ return 0;
+
*total_len += msl->memseg_arr.count * msl->page_sz;
return 0;
@@ -294,7 +329,7 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int msl_idx, ms_idx;
+ int msl_idx, ms_idx, fd;
FILE *f = arg;
msl_idx = msl - mcfg->memsegs;
@@ -305,10 +340,11 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
if (ms_idx < 0)
return -1;
+ fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
"virt:%p, socket_id:%"PRId32", "
"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n",
+ "nrank:%"PRIx32" fd:%i\n",
msl_idx, ms_idx,
ms->iova,
ms->len,
@@ -316,7 +352,8 @@ dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
ms->socket_id,
ms->hugepage_sz,
ms->nchannel,
- ms->nrank);
+ ms->nrank,
+ fd);
return 0;
}
@@ -383,6 +420,66 @@ rte_dump_physmem_layout(FILE *f)
rte_memseg_walk(dump_memseg, f);
}
+static int
+check_iova(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
+{
+ uint64_t *mask = arg;
+ rte_iova_t iova;
+
+ /* higher address within segment */
+ iova = (ms->iova + ms->len) - 1;
+ if (!(iova & *mask))
+ return 0;
+
+ RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
+ ms->iova, ms->len);
+
+ RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
+ return 1;
+}
+
+#if defined(RTE_ARCH_64)
+#define MAX_DMA_MASK_BITS 63
+#else
+#define MAX_DMA_MASK_BITS 31
+#endif
+
+/* check memseg iovas are within the required range based on dma mask */
+int __rte_experimental
+rte_eal_check_dma_mask(uint8_t maskbits)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ uint64_t mask;
+
+ /* sanity check */
+ if (maskbits > MAX_DMA_MASK_BITS) {
+ RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
+ maskbits, MAX_DMA_MASK_BITS);
+ return -1;
+ }
+
+ /* create dma mask */
+ mask = ~((1ULL << maskbits) - 1);
+
+ if (rte_memseg_walk(check_iova, &mask))
+ /*
+ * Dma mask precludes hugepage usage.
+ * This device cannot be used and we do not need to keep
+ * the dma mask.
+ */
+ return 1;
+
+ /*
+ * we need to keep the more restricted maskbit for checking
+ * potential dynamic memory allocation in the future.
+ */
+ mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
+ RTE_MIN(mcfg->dma_maskbits, maskbits);
+
+ return 0;
+}
+
/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
@@ -548,6 +645,105 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
return ret;
}
+int __rte_experimental
+rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, seg_idx, ret;
+
+ if (ms == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl = rte_mem_virt2memseg_list(ms->addr);
+ if (msl == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = rte_fbarray_find_idx(arr, ms);
+
+ if (!rte_fbarray_is_used(arr, seg_idx)) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+
+ ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
+ if (ret < 0) {
+ rte_errno = -ret;
+ ret = -1;
+ }
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd(const struct rte_memseg *ms)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ ret = rte_memseg_get_fd_thread_unsafe(ms);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
+ size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int msl_idx, seg_idx, ret;
+
+ if (ms == NULL || offset == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ msl = rte_mem_virt2memseg_list(ms->addr);
+ if (msl == NULL) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = rte_fbarray_find_idx(arr, ms);
+
+ if (!rte_fbarray_is_used(arr, seg_idx)) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+
+ ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
+ if (ret < 0) {
+ rte_errno = -ret;
+ ret = -1;
+ }
+ return ret;
+}
+
+int __rte_experimental
+rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
/* init memory subsystem */
int
rte_eal_memory_init(void)
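
Two of the additions above are driver/application facing: rte_eal_check_dma_mask() verifies that every memseg IOVA fits within a device's addressing limit, and rte_memseg_get_fd()/rte_memseg_get_fd_offset() expose the file backing a segment. A brief sketch of both experimental APIs, assuming a hypothetical device limited to 40-bit IOVAs:

    #include <stdio.h>
    #include <rte_common.h>
    #include <rte_memory.h>

    static int
    print_seg_fd(const struct rte_memseg_list *msl __rte_unused,
                    const struct rte_memseg *ms, void *arg __rte_unused)
    {
            size_t offset;
            int fd = rte_memseg_get_fd(ms);

            if (fd >= 0 && rte_memseg_get_fd_offset(ms, &offset) == 0)
                    printf("seg %p: fd %d offset %zu\n", ms->addr, fd, offset);
            return 0;
    }

    static int
    check_40bit_device(void)
    {
            int ret = rte_eal_check_dma_mask(40);

            if (ret != 0)   /* <0: bad mask size; >0: segment out of range */
                    return -1;
            rte_memseg_walk(print_seg_fd, NULL);
            return 0;
    }
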
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 7300fe05..b7081afb 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -120,13 +120,15 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
return NULL;
}
- if ((socket_id != SOCKET_ID_ANY) &&
- (socket_id >= RTE_MAX_NUMA_NODES || socket_id < 0)) {
+ if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) {
rte_errno = EINVAL;
return NULL;
}
- if (!rte_eal_has_hugepages())
+ /* only set socket to SOCKET_ID_ANY if we aren't allocating for an
+ * external heap.
+ */
+ if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES)
socket_id = SOCKET_ID_ANY;
contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index dd5f9740..b82f3ddd 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -58,6 +58,7 @@ eal_long_options[] = {
{OPT_HELP, 0, NULL, OPT_HELP_NUM },
{OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM },
{OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM },
+ {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM },
{OPT_LCORES, 1, NULL, OPT_LCORES_NUM },
{OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM },
{OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM },
@@ -205,6 +206,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
#endif
internal_cfg->vmware_tsc_map = 0;
internal_cfg->create_uio_dev = 0;
+ internal_cfg->iova_mode = RTE_IOVA_DC;
internal_cfg->user_mbuf_pool_ops_name = NULL;
internal_cfg->init_complete = 0;
}
@@ -1075,6 +1077,25 @@ eal_parse_proc_type(const char *arg)
return RTE_PROC_INVALID;
}
+static int
+eal_parse_iova_mode(const char *name)
+{
+ int mode;
+
+ if (name == NULL)
+ return -1;
+
+ if (!strcmp("pa", name))
+ mode = RTE_IOVA_PA;
+ else if (!strcmp("va", name))
+ mode = RTE_IOVA_VA;
+ else
+ return -1;
+
+ internal_config.iova_mode = mode;
+ return 0;
+}
+
int
eal_parse_common_option(int opt, const char *optarg,
struct internal_config *conf)
@@ -1281,6 +1302,13 @@ eal_parse_common_option(int opt, const char *optarg,
case OPT_SINGLE_FILE_SEGMENTS_NUM:
conf->single_file_segments = 1;
break;
+ case OPT_IOVA_MODE_NUM:
+ if (eal_parse_iova_mode(optarg) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_IOVA_MODE "\n");
+ return -1;
+ }
+ break;
/* don't know what to do, leave this to caller */
default:
@@ -1384,10 +1412,16 @@ eal_check_common_options(struct internal_config *internal_cfg)
" is only supported in non-legacy memory mode\n");
}
if (internal_cfg->single_file_segments &&
- internal_cfg->hugepage_unlink) {
+ internal_cfg->hugepage_unlink &&
+ !internal_cfg->in_memory) {
RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is "
- "not compatible with neither --"OPT_IN_MEMORY" nor "
- "--"OPT_HUGE_UNLINK"\n");
+ "not compatible with --"OPT_HUGE_UNLINK"\n");
+ return -1;
+ }
+ if (internal_cfg->legacy_mem &&
+ internal_cfg->in_memory) {
+ RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible "
+ "with --"OPT_IN_MEMORY"\n");
return -1;
}
@@ -1428,6 +1462,8 @@ eal_common_usage(void)
" --"OPT_VDEV" Add a virtual device.\n"
" The argument format is <driver><id>[,key=val,...]\n"
" (ex: --vdev=net_pcap0,iface=eth2).\n"
+ " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n"
+ " 'va' for IOVA_VA\n"
" -d LIB.so|DIR Add a driver or driver directory\n"
" (can be used multiple times)\n"
" --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n"
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 9fcb9121..97663d3b 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -939,13 +939,17 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
if (check_input(req) == false)
return -1;
+ reply->nb_sent = 0;
+ reply->nb_received = 0;
+ reply->msgs = NULL;
+
if (internal_config.no_shconf) {
RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n");
return 0;
}
if (gettimeofday(&now, NULL) < 0) {
- RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ RTE_LOG(ERR, EAL, "Failed to get current time\n");
rte_errno = errno;
return -1;
}
@@ -954,10 +958,6 @@ rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
end.tv_sec = now.tv_sec + ts->tv_sec +
(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
- reply->nb_sent = 0;
- reply->nb_received = 0;
- reply->msgs = NULL;
-
/* for secondary process, send request to the primary process only */
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
pthread_mutex_lock(&pending_requests.lock);
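
The hunk above moves the reply initialization before the no_shconf early return, so callers see nb_received == 0 and msgs == NULL instead of stack garbage when IPC is disabled. A sketch of the calling pattern that relies on this (the message name is illustrative):

    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
    #include <rte_eal.h>
    #include <rte_string_fns.h>

    static int
    query_peers(void)
    {
            struct rte_mp_msg req;
            struct rte_mp_reply reply;
            struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
            int i;

            memset(&req, 0, sizeof(req));
            strlcpy(req.name, "example_action", sizeof(req.name));

            if (rte_mp_request_sync(&req, &reply, &ts) < 0)
                    return -1;
            /* safe even when IPC is disabled: fields are now initialized */
            for (i = 0; i < reply.nb_received; i++)
                    ; /* process reply.msgs[i] */
            free(reply.msgs);
            return 0;
    }
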
diff --git a/lib/librte_eal/common/eal_common_string_fns.c b/lib/librte_eal/common/eal_common_string_fns.c
index 6ac5f828..60c5dd66 100644
--- a/lib/librte_eal/common/eal_common_string_fns.c
+++ b/lib/librte_eal/common/eal_common_string_fns.c
@@ -38,3 +38,29 @@ einval_error:
errno = EINVAL;
return -1;
}
+
+/* Copy src string into dst.
+ *
+ * Return a negative value and NUL-terminate if dst is too short;
+ * otherwise return the number of bytes copied.
+ */
+ssize_t
+rte_strscpy(char *dst, const char *src, size_t dsize)
+{
+ size_t nleft = dsize;
+ size_t res = 0;
+
+ /* Copy as many bytes as will fit. */
+ while (nleft != 0) {
+ dst[res] = src[res];
+ if (src[res] == '\0')
+ return res;
+ res++;
+ nleft--;
+ }
+
+ /* Not enough room in dst, set NUL and return error. */
+ if (res != 0)
+ dst[res - 1] = '\0';
+ return -E2BIG;
+}
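
Unlike strncpy, rte_strscpy() always NUL-terminates (for dsize > 0) and makes truncation detectable from the return value. A minimal usage sketch:

    #include <sys/types.h>
    #include <rte_string_fns.h>

    static int
    copy_name(char *dst, size_t dsize, const char *src)
    {
            ssize_t n = rte_strscpy(dst, src, dsize);

            if (n < 0)      /* -E2BIG: src did not fit; dst still terminated */
                    return -1;
            return 0;       /* n = bytes copied, excluding the NUL */
    }
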
diff --git a/lib/librte_eal/common/eal_common_timer.c b/lib/librte_eal/common/eal_common_timer.c
index 2e2b770f..dcf26bfe 100644
--- a/lib/librte_eal/common/eal_common_timer.c
+++ b/lib/librte_eal/common/eal_common_timer.c
@@ -7,9 +7,11 @@
#include <unistd.h>
#include <inttypes.h>
#include <sys/types.h>
+#include <time.h>
#include <errno.h>
#include <rte_common.h>
+#include <rte_compat.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_pause.h>
@@ -31,6 +33,28 @@ rte_delay_us_block(unsigned int us)
rte_pause();
}
+void __rte_experimental
+rte_delay_us_sleep(unsigned int us)
+{
+ struct timespec wait[2];
+ int ind = 0;
+
+ wait[0].tv_sec = 0;
+ if (us >= US_PER_S) {
+ wait[0].tv_sec = us / US_PER_S;
+ us -= wait[0].tv_sec * US_PER_S;
+ }
+ wait[0].tv_nsec = 1000 * us;
+
+ while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) {
+ /*
+ * Sleep was interrupted. Flip the index, so the 'remainder'
+ * will become the 'request' for a next call.
+ */
+ ind = 1 - ind;
+ }
+}
+
uint64_t
rte_get_tsc_hz(void)
{
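
rte_delay_us_sleep() matches the prototype expected by the existing rte_delay_us_callback_register(), so an application that prefers yielding the core over busy-waiting can install it as the default delay, trading wakeup latency for CPU time:

    #include <rte_cycles.h>

    static void
    use_sleeping_delays(void)
    {
            /* replace the busy-wait rte_delay_us_block() default */
            rte_delay_us_callback_register(rte_delay_us_sleep);
            rte_delay_us(500);      /* now sleeps via nanosleep() */
    }
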
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index de05febf..b3e8ae5e 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -27,7 +27,7 @@ eal_create_runtime_dir(void);
/* returns runtime dir */
const char *
-eal_get_runtime_dir(void);
+rte_eal_get_runtime_dir(void);
#define RUNTIME_CONFIG_FNAME "config"
static inline const char *
@@ -35,7 +35,7 @@ eal_runtime_config_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
RUNTIME_CONFIG_FNAME);
return buffer;
}
@@ -47,7 +47,7 @@ eal_mp_socket_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
MP_SOCKET_FNAME);
return buffer;
}
@@ -55,7 +55,8 @@ eal_mp_socket_path(void)
#define FBARRAY_NAME_FMT "%s/fbarray_%s"
static inline const char *
eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) {
- snprintf(buffer, buflen, FBARRAY_NAME_FMT, eal_get_runtime_dir(), name);
+ snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(),
+ name);
return buffer;
}
@@ -66,7 +67,7 @@ eal_hugepage_info_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
HUGEPAGE_INFO_FNAME);
return buffer;
}
@@ -78,7 +79,7 @@ eal_hugepage_data_path(void)
{
static char buffer[PATH_MAX]; /* static so auto-zeroed */
- snprintf(buffer, sizeof(buffer) - 1, "%s/%s", eal_get_runtime_dir(),
+ snprintf(buffer, sizeof(buffer) - 1, "%s/%s", rte_eal_get_runtime_dir(),
HUGEPAGE_DATA_FNAME);
return buffer;
}
@@ -99,7 +100,7 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id
static inline const char *
eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id)
{
- snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, eal_get_runtime_dir(),
+ snprintf(buffer, buflen, HUGEFILE_LOCK_FMT, rte_eal_get_runtime_dir(),
f_id);
buffer[buflen - 1] = '\0';
return buffer;
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 00ee6e06..737f17e3 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -70,6 +70,7 @@ struct internal_config {
/**< user defined mbuf pool ops name */
unsigned num_hugepage_sizes; /**< how many sizes on this system */
struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
+ enum rte_iova_mode iova_mode; /**< Set IOVA mode on this system */
volatile unsigned int init_complete;
/**< indicates whether EAL has completed initialization */
};
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 36bb1a02..af917c2f 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -76,6 +76,17 @@ eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id);
int
eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len);
+/* returns fd or -errno */
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx);
+
+/* returns 0 or -errno */
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd);
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset);
+
int
eal_memalloc_init(void);
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index 96e16678..5271f944 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -63,6 +63,8 @@ enum {
OPT_LEGACY_MEM_NUM,
#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments"
OPT_SINGLE_FILE_SEGMENTS_NUM,
+#define OPT_IOVA_MODE "iova-mode"
+ OPT_IOVA_MODE_NUM,
OPT_LONG_MAX_NUM
};
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 4f809a83..442c6dc4 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,18 +259,6 @@ struct rte_bus *rte_bus_find_by_device_name(const char *str);
int rte_mp_channel_init(void);
/**
- * Internal Executes all the user application registered callbacks for
- * the specific device. It is for DPDK internal user only. User
- * application should not call it directly.
- *
- * @param device_name
- * The device name.
- * @param event
- * the device event type.
- */
-void dev_callback_process(char *device_name, enum rte_dev_event_type event);
-
-/**
* @internal
* Parse a device string and store its information in an
* rte_devargs structure.
@@ -304,4 +292,82 @@ int
rte_devargs_layers_parse(struct rte_devargs *devargs,
const char *devstr);
+/*
+ * Probe a device at the local process.
+ *
+ * @param devargs
+ * Device arguments including bus, class and driver properties.
+ * @param new_dev
+ * The newly probed device, returned as output.
+ * @return
+ * 0 on success, negative on error.
+ */
+int local_dev_probe(const char *devargs, struct rte_device **new_dev);
+
+/**
+ * Hotplug remove a given device from a specific bus at local process.
+ *
+ * @param dev
+ * Data structure of the device to remove.
+ * @return
+ * 0 on success, negative on error.
+ */
+int local_dev_remove(struct rte_device *dev);
+
+/**
+ * Iterate over all buses to find the corresponding bus to handle the sigbus
+ * error.
+ * @param failure_addr
+ * Pointer to the faulting address of the sigbus error.
+ *
+ * @return
+ * 0 successfully handled the sigbus.
+ * -1 failed to handle the sigbus.
+ * 1 no bus could handle the sigbus.
+ */
+int rte_bus_sigbus_handler(const void *failure_addr);
+
+/**
+ * @internal
+ * Register the sigbus handler.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+dev_sigbus_handler_register(void);
+
+/**
+ * @internal
+ * Unregister the sigbus handler.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+dev_sigbus_handler_unregister(void);
+
+/**
+ * Check if the option is registered.
+ *
+ * @param option
+ * The option to be parsed.
+ *
+ * @return
+ * 0 on success
+ * @return
+ * -1 on fail
+ */
+int
+rte_option_parse(const char *opt);
+
+/**
+ * Iterate through the registered options and execute the associated
+ * callback if enabled.
+ */
+void
+rte_option_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/hotplug_mp.c b/lib/librte_eal/common/hotplug_mp.c
new file mode 100644
index 00000000..84f59d95
--- /dev/null
+++ b/lib/librte_eal/common/hotplug_mp.c
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+#include <string.h>
+
+#include <rte_eal.h>
+#include <rte_alarm.h>
+#include <rte_string_fns.h>
+#include <rte_devargs.h>
+
+#include "hotplug_mp.h"
+#include "eal_private.h"
+
+#define MP_TIMEOUT_S 5 /**< 5 second timeout */
+
+struct mp_reply_bundle {
+ struct rte_mp_msg msg;
+ void *peer;
+};
+
+static int cmp_dev_name(const struct rte_device *dev, const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
+/**
+ * Secondary to primary request.
+ * Starts from function eal_dev_hotplug_request_to_primary.
+ *
+ * device attach on secondary:
+ * a) secondary send sync request to the primary.
+ * b) primary receive the request and attach the new device if
+ * failed goto i).
+ * c) primary forward attach sync request to all secondary.
+ * d) secondary receive the request and attach the device and send a reply.
+ * e) primary check the reply if all success goes to j).
+ * f) primary send attach rollback sync request to all secondary.
+ * g) secondary receive the request and detach the device and send a reply.
+ * h) primary receive the reply and detach device as rollback action.
+ * i) send attach fail to secondary as a reply of step a), goto k).
+ * j) send attach success to secondary as a reply of step a).
+ * k) secondary receive reply and return.
+ *
+ * device detach on secondary:
+ * a) secondary send sync request to the primary.
+ * b) primary send detach sync request to all secondary.
+ * c) secondary detach the device and send a reply.
+ * d) primary check the reply if all success goes to g).
+ * e) primary send detach rollback sync request to all secondary.
+ * f) secondary receive the request and attach back device. goto h).
+ * g) primary detach the device if success goto i), else goto e).
+ * h) primary send detach fail to secondary as a reply of step a), goto j).
+ * i) primary send detach success to secondary as a reply of step a).
+ * j) secondary receive reply and return.
+ */
+
+static int
+send_response_to_secondary(const struct eal_dev_mp_req *req,
+ int result,
+ const void *peer)
+{
+ struct rte_mp_msg mp_resp;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ int ret;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+ mp_resp.len_param = sizeof(*resp);
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ memcpy(resp, req, sizeof(*req));
+ resp->result = result;
+
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret != 0)
+ RTE_LOG(ERR, EAL, "failed to send response to secondary\n");
+
+ return ret;
+}
+
+static void
+__handle_secondary_request(void *param)
+{
+ struct mp_reply_bundle *bundle = param;
+ const struct rte_mp_msg *msg = &bundle->msg;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct eal_dev_mp_req tmp_req;
+ struct rte_devargs *da;
+ struct rte_device *dev;
+ struct rte_bus *bus;
+ int ret = 0;
+
+ tmp_req = *req;
+
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH) {
+ ret = local_dev_probe(req->devargs, &dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n");
+ if (ret != -EEXIST)
+ goto finish;
+ }
+ ret = eal_dev_hotplug_request_to_secondary(&tmp_req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+ if (tmp_req.result != 0) {
+ ret = tmp_req.result;
+ RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n");
+ if (ret != -EEXIST)
+ goto rollback;
+ }
+ } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) {
+ da = calloc(1, sizeof(*da));
+ if (da == NULL) {
+ ret = -ENOMEM;
+ goto finish;
+ }
+
+ ret = rte_devargs_parse(da, req->devargs);
+ if (ret != 0)
+ goto finish;
+
+ ret = eal_dev_hotplug_request_to_secondary(&tmp_req);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n");
+ ret = -ENOMSG;
+ goto rollback;
+ }
+
+ bus = rte_bus_find_by_name(da->bus->name);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name);
+ ret = -ENOENT;
+ goto finish;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name, da->name);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name);
+ ret = -ENOENT;
+ goto finish;
+ }
+
+ if (tmp_req.result != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n");
+ ret = tmp_req.result;
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+
+ ret = local_dev_remove(dev);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n");
+ if (ret != -ENOENT)
+ goto rollback;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n");
+ ret = -ENOTSUP;
+ }
+ goto finish;
+
+rollback:
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH) {
+ tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK;
+ eal_dev_hotplug_request_to_secondary(&tmp_req);
+ local_dev_remove(dev);
+ } else {
+ tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK;
+ eal_dev_hotplug_request_to_secondary(&tmp_req);
+ }
+
+finish:
+ ret = send_response_to_secondary(&tmp_req, ret, bundle->peer);
+ if (ret)
+ RTE_LOG(ERR, EAL, "failed to send response to secondary\n");
+
+ free(bundle->peer);
+ free(bundle);
+}
+
+static int
+handle_secondary_request(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct mp_reply_bundle *bundle;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ int ret = 0;
+
+ bundle = malloc(sizeof(*bundle));
+ if (bundle == NULL) {
+ RTE_LOG(ERR, EAL, "not enough memory\n");
+ return send_response_to_secondary(req, -ENOMEM, peer);
+ }
+
+ bundle->msg = *msg;
+ /**
+ * We need to send the reply from the interrupt thread, but the peer
+ * handle cannot be carried over directly, so this is a temporary hack
+ * to be fixed once the IPC layer supports it.
+ */
+ bundle->peer = strdup(peer);
+
+ /**
+ * We are in the IPC callback thread; synchronous IPC is not allowed
+ * here because it would deadlock, so we delegate the task to the
+ * interrupt thread.
+ */
+ ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "failed to add mp task\n");
+ return send_response_to_secondary(req, ret, peer);
+ }
+ return 0;
+}
+
+static void __handle_primary_request(void *param)
+{
+ struct mp_reply_bundle *bundle = param;
+ struct rte_mp_msg *msg = &bundle->msg;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct rte_mp_msg mp_resp;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ struct rte_devargs *da;
+ struct rte_device *dev;
+ struct rte_bus *bus;
+ int ret = 0;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+
+ switch (req->t) {
+ case EAL_DEV_REQ_TYPE_ATTACH:
+ case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK:
+ ret = local_dev_probe(req->devargs, &dev);
+ break;
+ case EAL_DEV_REQ_TYPE_DETACH:
+ case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK:
+ da = calloc(1, sizeof(*da));
+ if (da == NULL) {
+ ret = -ENOMEM;
+ goto quit;
+ }
+
+ ret = rte_devargs_parse(da, req->devargs);
+ if (ret != 0)
+ goto quit;
+
+ bus = rte_bus_find_by_name(da->bus->name);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name);
+ ret = -ENOENT;
+ goto quit;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name, da->name);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name);
+ ret = -ENOENT;
+ goto quit;
+ }
+
+ ret = local_dev_remove(dev);
+quit:
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ mp_resp.len_param = sizeof(*req);
+ memcpy(resp, req, sizeof(*resp));
+ resp->result = ret;
+ if (rte_mp_reply(&mp_resp, bundle->peer) < 0)
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+
+ free(bundle->peer);
+ free(bundle);
+}
+
+static int
+handle_primary_request(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg mp_resp;
+ const struct eal_dev_mp_req *req =
+ (const struct eal_dev_mp_req *)msg->param;
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_resp.param;
+ struct mp_reply_bundle *bundle;
+ int ret = 0;
+
+ memset(&mp_resp, 0, sizeof(mp_resp));
+ strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name));
+ mp_resp.len_param = sizeof(*req);
+ memcpy(resp, req, sizeof(*resp));
+
+ bundle = calloc(1, sizeof(*bundle));
+ if (bundle == NULL) {
+ resp->result = -ENOMEM;
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret)
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+ return ret;
+ }
+
+ bundle->msg = *msg;
+ /**
+ * We need to send the reply from the interrupt thread, but the peer
+ * handle cannot be carried over directly, so this is a temporary hack
+ * to be fixed once the IPC layer supports it.
+ */
+ bundle->peer = (void *)strdup(peer);
+
+ /**
+ * We are in the IPC callback thread; synchronous IPC is not allowed
+ * here because it would deadlock, so we delegate the task to the
+ * interrupt thread.
+ */
+ ret = rte_eal_alarm_set(1, __handle_primary_request, bundle);
+ if (ret != 0) {
+ resp->result = ret;
+ ret = rte_mp_reply(&mp_resp, peer);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "failed to send reply to primary request\n");
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0};
+ struct eal_dev_mp_req *resp;
+ int ret;
+
+ memset(&mp_req, 0, sizeof(mp_req));
+ memcpy(mp_req.param, req, sizeof(*req));
+ mp_req.len_param = sizeof(*req);
+ strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name));
+
+ ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts);
+ if (ret || mp_reply.nb_received != 1) {
+ RTE_LOG(ERR, EAL, "cannot send request to primary");
+ if (!ret)
+ return -1;
+ return ret;
+ }
+
+ resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param;
+ req->result = resp->result;
+
+ return ret;
+}
+
+int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0};
+ int ret;
+ int i;
+
+ memset(&mp_req, 0, sizeof(mp_req));
+ memcpy(mp_req.param, req, sizeof(*req));
+ mp_req.len_param = sizeof(*req);
+ strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name));
+
+ ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n");
+ return ret;
+ }
+
+ if (mp_reply.nb_sent != mp_reply.nb_received) {
+ RTE_LOG(ERR, EAL, "not all secondary reply\n");
+ return -1;
+ }
+
+ req->result = 0;
+ for (i = 0; i < mp_reply.nb_received; i++) {
+ struct eal_dev_mp_req *resp =
+ (struct eal_dev_mp_req *)mp_reply.msgs[i].param;
+ if (resp->result != 0) {
+ req->result = resp->result;
+ if (req->t == EAL_DEV_REQ_TYPE_ATTACH &&
+ req->result != -EEXIST)
+ break;
+ if (req->t == EAL_DEV_REQ_TYPE_DETACH &&
+ req->result != -ENOENT)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int rte_mp_dev_hotplug_init(void)
+{
+ int ret;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST,
+ handle_secondary_request);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ EAL_DEV_MP_ACTION_REQUEST);
+ return ret;
+ }
+ } else {
+ ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST,
+ handle_primary_request);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ EAL_DEV_MP_ACTION_REQUEST);
+ return ret;
+ }
+ }
+
+ return 0;
+}
diff --git a/lib/librte_eal/common/hotplug_mp.h b/lib/librte_eal/common/hotplug_mp.h
new file mode 100644
index 00000000..597fde3d
--- /dev/null
+++ b/lib/librte_eal/common/hotplug_mp.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _HOTPLUG_MP_H_
+#define _HOTPLUG_MP_H_
+
+#include "rte_dev.h"
+#include "rte_bus.h"
+
+#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request"
+#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response"
+
+#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN
+#define EAL_DEV_MP_BUS_NAME_MAX_LEN 32
+#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128
+
+enum eal_dev_req_type {
+ EAL_DEV_REQ_TYPE_ATTACH,
+ EAL_DEV_REQ_TYPE_DETACH,
+ EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK,
+ EAL_DEV_REQ_TYPE_DETACH_ROLLBACK,
+};
+
+struct eal_dev_mp_req {
+ enum eal_dev_req_type t;
+ char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN];
+ int result;
+};
+
+/**
+ * A synchronous wrapper for a secondary process to send a hotplug
+ * request to the primary process; it is invoked when an attach or
+ * detach request is issued from a secondary process.
+ */
+int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req);
+
+/**
+ * A synchronous wrapper for the primary process to send a hotplug
+ * request to all secondary processes; it is invoked to propagate an
+ * attach or detach request to every secondary process.
+ */
+int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req);
+
+
+#endif /* _HOTPLUG_MP_H_ */
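
For illustration, a minimal sketch of how a secondary process would use this interface to ask the primary to attach a device; it assumes rte_mp_dev_hotplug_init() has registered the actions above, and the helper name is invented:

    #include <string.h>
    #include <rte_string_fns.h>
    #include "hotplug_mp.h"

    /* Hypothetical helper: ask the primary to attach a device for us. */
    static int
    request_attach_from_secondary(const char *devargs)
    {
            struct eal_dev_mp_req req;

            memset(&req, 0, sizeof(req));
            req.t = EAL_DEV_REQ_TYPE_ATTACH;
            strlcpy(req.devargs, devargs, sizeof(req.devargs));

            /* Blocks until the primary replies; req.result holds the outcome. */
            if (eal_dev_hotplug_request_to_primary(&req) != 0)
                    return -1;
            return req.result;
    }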
diff --git a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
index c4f974fe..859b0974 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h
@@ -29,8 +29,8 @@ extern "C" {
#ifndef RTE_ARM_EAL_RDTSC_USE_PMU
/**
- * This call is easily portable to any ARM architecture, however,
- * it may be damn slow and inprecise for some tasks.
+ * This call is easily portable to any architecture, however,
+ * it may require a system call and be imprecise for some tasks.
*/
static inline uint64_t
__rte_rdtsc_syscall(void)
diff --git a/lib/librte_eal/common/include/arch/ppc_64/meson.build b/lib/librte_eal/common/include/arch/ppc_64/meson.build
new file mode 100644
index 00000000..00f96117
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Luca Boccassi <bluca@debian.org>
+
+install_headers(
+ 'rte_atomic.h',
+ 'rte_byteorder.h',
+ 'rte_cpuflags.h',
+ 'rte_cycles.h',
+ 'rte_io.h',
+ 'rte_memcpy.h',
+ 'rte_pause.h',
+ 'rte_prefetch.h',
+ 'rte_rwlock.h',
+ 'rte_spinlock.h',
+ 'rte_vect.h',
+ subdir: get_option('include_subdir_arch'))
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
index 8bd83576..16e47ce2 100644
--- a/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_pause.h
@@ -9,10 +9,17 @@
extern "C" {
#endif
+#include "rte_atomic.h"
+
#include "generic/rte_pause.h"
static inline void rte_pause(void)
{
+ /* Set hardware multi-threading low priority */
+ asm volatile("or 1,1,1");
+ /* Set hardware multi-threading medium priority */
+ asm volatile("or 2,2,2");
+ rte_compiler_barrier();
}
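
The two priority nops lower and then restore the SMT hardware-thread priority around the wait, freeing pipeline resources for sibling threads. A typical spin-wait caller (illustrative only):

    /* Busy-wait on a flag, yielding SMT resources on each iteration. */
    static inline void
    wait_for_flag(volatile int *flag)
    {
            while (*flag == 0)
                    rte_pause();
    }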
#ifdef __cplusplus
diff --git a/lib/librte_eal/common/include/generic/rte_cycles.h b/lib/librte_eal/common/include/generic/rte_cycles.h
index 0ff1af50..ac379e87 100644
--- a/lib/librte_eal/common/include/generic/rte_cycles.h
+++ b/lib/librte_eal/common/include/generic/rte_cycles.h
@@ -13,6 +13,7 @@
*/
#include <stdint.h>
+#include <rte_compat.h>
#include <rte_debug.h>
#include <rte_atomic.h>
@@ -158,6 +159,16 @@ rte_delay_ms(unsigned ms)
void rte_delay_us_block(unsigned int us);
/**
+ * Delay function that uses system sleep.
+ * Does not block the CPU core.
+ *
+ * @param us
+ * Number of microseconds to wait.
+ */
+void __rte_experimental
+rte_delay_us_sleep(unsigned int us);
+
+/**
* Replace rte_delay_us with user defined function.
*
* @param userfunc
diff --git a/lib/librte_eal/common/include/rte_bitmap.h b/lib/librte_eal/common/include/rte_bitmap.h
index d9facc64..7a36ce73 100644
--- a/lib/librte_eal/common/include/rte_bitmap.h
+++ b/lib/librte_eal/common/include/rte_bitmap.h
@@ -88,7 +88,7 @@ __rte_bitmap_index1_inc(struct rte_bitmap *bmp)
static inline uint64_t
__rte_bitmap_mask1_get(struct rte_bitmap *bmp)
{
- return (~1lu) << bmp->offset1;
+ return (~1llu) << bmp->offset1;
}
static inline void
@@ -317,7 +317,7 @@ rte_bitmap_get(struct rte_bitmap *bmp, uint32_t pos)
index2 = pos >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2;
offset2 = pos & RTE_BITMAP_SLAB_BIT_MASK;
slab2 = bmp->array2 + index2;
- return (*slab2) & (1lu << offset2);
+ return (*slab2) & (1llu << offset2);
}
/**
@@ -342,8 +342,8 @@ rte_bitmap_set(struct rte_bitmap *bmp, uint32_t pos)
slab2 = bmp->array2 + index2;
slab1 = bmp->array1 + index1;
- *slab2 |= 1lu << offset2;
- *slab1 |= 1lu << offset1;
+ *slab2 |= 1llu << offset2;
+ *slab1 |= 1llu << offset1;
}
/**
@@ -370,7 +370,7 @@ rte_bitmap_set_slab(struct rte_bitmap *bmp, uint32_t pos, uint64_t slab)
slab1 = bmp->array1 + index1;
*slab2 |= slab;
- *slab1 |= 1lu << offset1;
+ *slab1 |= 1llu << offset1;
}
static inline uint64_t
@@ -408,7 +408,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos)
slab2 = bmp->array2 + index2;
/* Return if array2 slab is not all-zeros */
- *slab2 &= ~(1lu << offset2);
+ *slab2 &= ~(1llu << offset2);
if (*slab2){
return;
}
@@ -424,7 +424,7 @@ rte_bitmap_clear(struct rte_bitmap *bmp, uint32_t pos)
index1 = pos >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 + RTE_BITMAP_CL_BIT_SIZE_LOG2);
offset1 = (pos >> RTE_BITMAP_CL_BIT_SIZE_LOG2) & RTE_BITMAP_SLAB_BIT_MASK;
slab1 = bmp->array1 + index1;
- *slab1 &= ~(1lu << offset1);
+ *slab1 &= ~(1llu << offset1);
return;
}
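
The 1lu to 1llu conversions matter on 32-bit targets, where long is 32 bits wide: shifting a 32-bit constant by an offset that can reach 63 is undefined behavior and loses the upper half of the slab. A minimal sketch of the corrected pattern:

    #include <stdint.h>

    /* On ILP32 builds, (1lu << offset) is undefined for offset >= 32;
     * 1llu is guaranteed to be at least 64 bits wide.
     */
    static inline uint64_t
    slab_set_bit(uint64_t slab, uint32_t offset)
    {
            return slab | (1llu << offset);
    }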
diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h
index b7b5b084..6be4b5ca 100644
--- a/lib/librte_eal/common/include/rte_bus.h
+++ b/lib/librte_eal/common/include/rte_bus.h
@@ -168,6 +168,35 @@ typedef int (*rte_bus_unplug_t)(struct rte_device *dev);
typedef int (*rte_bus_parse_t)(const char *name, void *addr);
/**
+ * Implement a specific hot-unplug handler, which is responsible for
+ * handling the failure when a device is hot-unplugged. When a hot-unplug
+ * event is detected, this function can be called to handle the failure
+ * and avoid an application crash.
+ * @param dev
+ * Pointer to the device structure.
+ *
+ * @return
+ * 0 on success.
+ * !0 on error.
+ */
+typedef int (*rte_bus_hot_unplug_handler_t)(struct rte_device *dev);
+
+/**
+ * Implement a specific SIGBUS handler, which is responsible for handling
+ * a SIGBUS error that is either a generic memory error or a memory error
+ * caused by a device being hot-unplugged. When a SIGBUS error is captured,
+ * this function can be called to handle it.
+ * @param failure_addr
+ * Pointer to the fault address of the SIGBUS error.
+ *
+ * @return
+ * 0 if the SIGBUS error was handled as a hot-unplug failure.
+ * 1 if it was not processed, because it is a generic SIGBUS error.
+ * -1 if handling the SIGBUS error for hot-unplug failed.
+ */
+typedef int (*rte_bus_sigbus_handler_t)(const void *failure_addr);
+
+/**
* Bus scan policies
*/
enum rte_bus_scan_mode {
@@ -212,6 +241,11 @@ struct rte_bus {
struct rte_bus_conf conf; /**< Bus configuration */
rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */
rte_dev_iterate_t dev_iterate; /**< Device iterator. */
+ rte_bus_hot_unplug_handler_t hot_unplug_handler;
+ /**< handle hot-unplug failure on the bus */
+ rte_bus_sigbus_handler_t sigbus_handler;
+ /**< handle sigbus error on the bus */
+
};
/**
diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index 069c13ec..cba7bbc1 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -68,6 +68,11 @@ typedef uint16_t unaligned_uint16_t;
/******* Macro to mark functions and fields scheduled for removal *****/
#define __rte_deprecated __attribute__((__deprecated__))
+/**
+ * Mark a function or variable to a weak reference.
+ */
+#define __rte_weak __attribute__((__weak__))
+
/*********** Macros to eliminate unused variable warnings ********/
/**
@@ -164,6 +169,12 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void)
*/
#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2))
+/**
+ * Workaround to cast a const field of a structure to non-const type.
+ */
+#define RTE_CAST_FIELD(var, field, type) \
+ (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field)))
+
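
A hedged illustration of RTE_CAST_FIELD; the struct and field are invented for the example:

    struct config {
            const int id; /* read-only for normal users */
    };

    static void
    set_id(struct config *cfg, int new_id)
    {
            /* Strips the const qualifier from cfg->id so it can be assigned. */
            RTE_CAST_FIELD(cfg, id, int) = new_id;
    }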
/*********** Macros/static functions for doing alignment ********/
diff --git a/lib/librte_eal/common/include/rte_dev.h b/lib/librte_eal/common/include/rte_dev.h
index b80a8059..cd6c187c 100644
--- a/lib/librte_eal/common/include/rte_dev.h
+++ b/lib/librte_eal/common/include/rte_dev.h
@@ -39,7 +39,7 @@ struct rte_dev_event {
char *devname; /**< device name */
};
-typedef void (*rte_dev_event_cb_fn)(char *device_name,
+typedef void (*rte_dev_event_cb_fn)(const char *device_name,
enum rte_dev_event_type event,
void *cb_arg);
@@ -156,63 +156,67 @@ struct rte_driver {
struct rte_device {
TAILQ_ENTRY(rte_device) next; /**< Next device */
const char *name; /**< Device name */
- const struct rte_driver *driver;/**< Associated driver */
+ const struct rte_driver *driver; /**< Driver assigned after probing */
+ const struct rte_bus *bus; /**< Bus handle assigned on scan */
int numa_node; /**< NUMA node connection */
- struct rte_devargs *devargs; /**< Device user arguments */
+ struct rte_devargs *devargs; /**< Arguments for latest probing */
};
/**
- * Attach a device to a registered driver.
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
*
- * @param name
- * The device name, that refers to a pci device (or some private
- * way of designating a vdev device). Based on this device name, eal
- * will identify a driver capable of handling it and pass it to the
- * driver probing function.
- * @param devargs
- * Device arguments to be passed to the driver.
- * @return
- * 0 on success, negative on error.
- */
-__rte_deprecated
-int rte_eal_dev_attach(const char *name, const char *devargs);
-
-/**
- * Detach a device from its driver.
+ * Query status of a device.
*
* @param dev
- * A pointer to a rte_device structure.
+ * Generic device pointer.
* @return
- * 0 on success, negative on error.
+ * (int)true if already probed successfully, 0 otherwise.
*/
-__rte_deprecated
-int rte_eal_dev_detach(struct rte_device *dev);
+__rte_experimental
+int rte_dev_is_probed(const struct rte_device *dev);
/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
* Hotplug add a given device to a specific bus.
*
+ * In multi-process, it will request other processes to add the same device.
+ * A failure, in any process, will roll back the action.
+ *
* @param busname
* The bus name the device is added to.
* @param devname
* The device name. Based on this device name, eal will identify a driver
* capable of handling it and pass it to the driver probing function.
- * @param devargs
+ * @param drvargs
* Device arguments to be passed to the driver.
* @return
* 0 on success, negative on error.
*/
-int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devname,
- const char *devargs);
+int rte_eal_hotplug_add(const char *busname, const char *devname,
+ const char *drvargs);
/**
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
*
+ * Add matching devices.
+ *
+ * In multi-process, it will request other processes to add the same device.
+ * A failure, in any process, will roll back the action.
+ *
+ * @param devargs
+ * Device arguments including bus, class and driver properties.
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental rte_dev_probe(const char *devargs);
+
+/**
* Hotplug remove a given device from a specific bus.
*
+ * In multi-process, it will request other processes to remove the same device.
+ * A failure, in any process, will roll back the action.
+ *
* @param busname
* The bus name the device is removed from.
* @param devname
@@ -220,8 +224,23 @@ int __rte_experimental rte_eal_hotplug_add(const char *busname, const char *devn
* @return
* 0 on success, negative on error.
*/
-int __rte_experimental rte_eal_hotplug_remove(const char *busname,
- const char *devname);
+int rte_eal_hotplug_remove(const char *busname, const char *devname);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Remove one device.
+ *
+ * In multi-process, it will request other processes to remove the same device.
+ * A failure, in any process, will roll back the action.
+ *
+ * @param dev
+ * Data structure of the device to remove.
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental rte_dev_remove(struct rte_device *dev);
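
Where rte_dev_probe()/rte_dev_remove() take a full devargs string and a device pointer, the hotplug pair splits bus name, device name and driver arguments. A minimal sketch of the string-based pair; "net_null0" is a placeholder vdev name:

    /* Attach a hypothetical vdev, then detach it again. */
    static int
    hotplug_cycle(void)
    {
            if (rte_eal_hotplug_add("vdev", "net_null0", "") < 0)
                    return -1;
            return rte_eal_hotplug_remove("vdev", "net_null0");
    }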
/**
* Device comparison function.
@@ -438,6 +457,22 @@ rte_dev_event_callback_unregister(const char *device_name,
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
*
+ * Executes all the user application registered callbacks for
+ * the specific device.
+ *
+ * @param device_name
+ * The device name.
+ * @param event
+ * the device event type.
+ */
+void __rte_experimental
+rte_dev_event_callback_process(const char *device_name,
+ enum rte_dev_event_type event);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
* Start the device event monitoring.
*
* @return
@@ -460,4 +495,30 @@ rte_dev_event_monitor_start(void);
int __rte_experimental
rte_dev_event_monitor_stop(void);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable hotplug handling for devices.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Disable hotplug handling for devices.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void);
+
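
A sketch of the expected enable sequence, pairing the new handle-enable call with the existing event monitor; the ordering shown is an assumption based on the API shape:

    static int
    enable_hotplug_handling(void)
    {
            if (rte_dev_event_monitor_start() < 0)
                    return -1;
            if (rte_dev_hotplug_handle_enable() < 0) {
                    rte_dev_event_monitor_stop();
                    return -1;
            }
            return 0;
    }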
#endif /* _RTE_DEV_H_ */
diff --git a/lib/librte_eal/common/include/rte_devargs.h b/lib/librte_eal/common/include/rte_devargs.h
index 097a4ce7..b1f121f8 100644
--- a/lib/librte_eal/common/include/rte_devargs.h
+++ b/lib/librte_eal/common/include/rte_devargs.h
@@ -67,36 +67,6 @@ struct rte_devargs {
};
/**
- * @deprecated
- * Parse a devargs string.
- *
- * For PCI devices, the format of arguments string is "PCI_ADDR" or
- * "PCI_ADDR,key=val,key2=val2,...". Examples: "08:00.1", "0000:5:00.0",
- * "04:00.0,arg=val".
- *
- * For virtual devices, the format of arguments string is "DRIVER_NAME*"
- * or "DRIVER_NAME*,key=val,key2=val2,...". Examples: "net_ring",
- * "net_ring0", "net_pmdAnything,arg=0:arg2=1".
- *
- * The function parses the arguments string to get driver name and driver
- * arguments.
- *
- * @param devargs_str
- * The arguments as given by the user.
- * @param drvname
- * The pointer to the string to store parsed driver name.
- * @param drvargs
- * The pointer to the string to store parsed driver arguments.
- *
- * @return
- * - 0 on success
- * - A negative value on error
- */
-__rte_deprecated
-int rte_eal_parse_devargs_str(const char *devargs_str,
- char **drvname, char **drvargs);
-
-/**
* Parse a device string.
*
* Verify that a bus is capable of handling the device passed
@@ -202,32 +172,12 @@ __rte_experimental
int rte_devargs_add(enum rte_devtype devtype, const char *devargs_str);
/**
- * @deprecated
- * Add a device to the user device list
- * See rte_devargs_parse() for details.
- *
- * @param devtype
- * The type of the device.
- * @param devargs_str
- * The arguments as given by the user.
- *
- * @return
- * - 0 on success
- * - A negative value on error
- */
-__rte_deprecated
-int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str);
-
-/**
* Remove a device from the user device list.
* Its resources are freed.
* If the devargs cannot be found, nothing happens.
*
- * @param busname
- * bus name of the devargs to remove.
- *
- * @param devname
- * device name of the devargs to remove.
+ * @param devargs
+ * The instance or a copy of devargs to remove.
*
* @return
* 0 on success.
@@ -235,8 +185,7 @@ int rte_eal_devargs_add(enum rte_devtype devtype, const char *devargs_str);
* >0 if the devargs was not within the user device list.
*/
__rte_experimental
-int rte_devargs_remove(const char *busname,
- const char *devname);
+int rte_devargs_remove(struct rte_devargs *devargs);
/**
* Count the number of user devices of a specified type
@@ -252,20 +201,6 @@ unsigned int
rte_devargs_type_count(enum rte_devtype devtype);
/**
- * @deprecated
- * Count the number of user devices of a specified type
- *
- * @param devtype
- * The type of the devices to counted.
- *
- * @return
- * The number of devices.
- */
-__rte_deprecated
-unsigned int
-rte_eal_devargs_type_count(enum rte_devtype devtype);
-
-/**
* This function dumps the list of user device and their arguments.
*
* @param f
@@ -275,16 +210,6 @@ __rte_experimental
void rte_devargs_dump(FILE *f);
/**
- * @deprecated
- * This function dumps the list of user device and their arguments.
- *
- * @param f
- * A pointer to a file for output
- */
-__rte_deprecated
-void rte_eal_devargs_dump(FILE *f);
-
-/**
* Find next rte_devargs matching the provided bus name.
*
* @param busname
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index e114dcbd..a0cedd57 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -316,7 +316,7 @@ rte_mp_sendmsg(struct rte_mp_msg *msg);
*
* @param reply
* The reply argument will be for storing all the replied messages;
- * the caller is responsible for free reply->replies.
+ * the caller is responsible for freeing reply->msgs.
*
* @param ts
* The ts argument specifies how long we can wait for the peer(s) to reply.
@@ -378,6 +378,15 @@ int __rte_experimental
rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
/**
+ * Register all mp action callbacks for hotplug.
+ *
+ * @return
+ * 0 on success, negative on error.
+ */
+int __rte_experimental
+rte_mp_dev_hotplug_init(void);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
@@ -498,6 +507,15 @@ enum rte_iova_mode rte_eal_iova_mode(void);
const char *
rte_eal_mbuf_user_pool_ops(void);
+/**
+ * Get the runtime directory of DPDK
+ *
+ * @return
+ * The runtime directory path of DPDK
+ */
+const char *
+rte_eal_get_runtime_dir(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index 6eb49327..9d302f41 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -35,6 +35,7 @@ enum rte_intr_handle_type {
RTE_INTR_HANDLE_EXT, /**< external handler */
RTE_INTR_HANDLE_VDEV, /**< virtual device */
RTE_INTR_HANDLE_DEV_EVENT, /**< device event handle */
+ RTE_INTR_HANDLE_VFIO_REQ, /**< VFIO request handle */
RTE_INTR_HANDLE_MAX /**< count of elements */
};
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index aff0688d..84aabe36 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -30,9 +30,11 @@ struct rte_memseg_list {
uint64_t addr_64;
/**< Makes sure addr is always 64-bits */
};
- int socket_id; /**< Socket ID for all memsegs in this list. */
uint64_t page_sz; /**< Page size for all memsegs in this list. */
+ int socket_id; /**< Socket ID for all memsegs in this list. */
volatile uint32_t version; /**< version number for multiprocess sync. */
+ size_t len; /**< Length of memory area covered by this memseg list. */
+ unsigned int external; /**< 1 if this list points to external memory */
struct rte_fbarray memseg_arr;
};
@@ -70,13 +72,23 @@ struct rte_mem_config {
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
- /* Heaps of Malloc per socket */
- struct malloc_heap malloc_heaps[RTE_MAX_NUMA_NODES];
+ /* Heaps of Malloc */
+ struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
+
+ /* next socket ID for external malloc heap */
+ int next_socket_id;
/* address of mem_config in primary process. used to map shared config into
* exact same address the primary process maps it.
*/
uint64_t mem_cfg_addr;
+
+ /* legacy mem and single file segments options are shared */
+ uint32_t legacy_mem;
+ uint32_t single_file_segments;
+
+ /* keeps the more restricted dma mask */
+ uint8_t dma_maskbits;
} __attribute__((__packed__));
diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e45..7249e6aa 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -264,6 +264,198 @@ rte_malloc_get_socket_stats(int socket,
struct rte_malloc_socket_stats *socket_stats);
/**
+ * Add memory chunk to a heap with specified name.
+ *
+ * @note Multiple memory chunks can be added to the same heap
+ *
+ * @note Before accessing this memory in other processes, it needs to be
+ * attached in each of those processes by calling
+ * ``rte_malloc_heap_memory_attach``.
+ *
+ * @note Memory must be previously allocated for DPDK to be able to use it as a
+ * malloc heap. Failing to do so will result in undefined behavior, up to and
+ * including segmentation faults.
+ *
+ * @note Calling this function will erase any contents already present at the
+ * supplied memory address.
+ *
+ * @param heap_name
+ * Name of the heap to add memory chunk to
+ * @param va_addr
+ * Start of virtual area to add to the heap
+ * @param len
+ * Length of virtual area to add to the heap
+ * @param iova_addrs
+ * Array of page IOVA addresses corresponding to each page in this memory
+ * area. Can be NULL, in which case page IOVA addresses will be set to
+ * RTE_BAD_IOVA.
+ * @param n_pages
+ * Number of elements in the iova_addrs array. Ignored if ``iova_addrs``
+ * is NULL.
+ * @param page_sz
+ * Page size of the underlying memory
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to add memory to a reserved heap
+ * ENOSPC - no more space in internal config to store a new memory chunk
+ */
+int __rte_experimental
+rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
+/**
+ * Remove memory chunk from heap with specified name.
+ *
+ * @note Memory chunk being removed must be the same as one that was added;
+ * partially removing memory chunks is not supported
+ *
+ * @note Memory area must not contain any allocated elements to allow its
+ * removal from the heap
+ *
+ * @note All other processes must detach from the memory chunk prior to it being
+ * removed from the heap.
+ *
+ * @param heap_name
+ * Name of the heap to remove memory from
+ * @param va_addr
+ * Virtual address to remove from the heap
+ * @param len
+ * Length of virtual area to remove from the heap
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to remove memory from a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ * EBUSY - memory chunk still contains data
+ */
+int __rte_experimental
+rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len);
+
+/**
+ * Attach to an already existing chunk of external memory in another process.
+ *
+ * @note This function must be called before any attempt is made to use an
+ * already existing external memory chunk. This function does *not* need to
+ * be called if a call to ``rte_malloc_heap_memory_add`` was made in the
+ * current process.
+ *
+ * @param heap_name
+ * Heap name to which this chunk of memory belongs
+ * @param va_addr
+ * Start address of memory chunk to attach to
+ * @param len
+ * Length of memory chunk to attach to
+ * @return
+ * 0 on successful attach
+ * -1 on unsuccessful attach, with rte_errno set to indicate cause for error:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to attach memory to a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ */
+int __rte_experimental
+rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len);
+
+/**
+ * Detach from a chunk of external memory in secondary process.
+ *
+ * @note This function must be called before any attempt is made to remove
+ * external memory from the heap in another process. This function does *not*
+ * need to be called if ``rte_malloc_heap_memory_remove`` will be called in
+ * the current process.
+ *
+ * @param heap_name
+ * Heap name to which this chunk of memory belongs
+ * @param va_addr
+ * Start address of memory chunk to detach from
+ * @param len
+ * Length of memory chunk to detach from
+ * @return
+ * 0 on successful detach
+ * -1 on unsuccessful detach, with rte_errno set to indicate cause for error:
+ * EINVAL - one of the parameters was invalid
+ * EPERM - attempted to detach memory from a reserved heap
+ * ENOENT - heap or memory chunk was not found
+ */
+int __rte_experimental
+rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len);
+
+/**
+ * Creates a new empty malloc heap with a specified name.
+ *
+ * @note Heaps created via this call will automatically get assigned a unique
+ * socket ID, which can be found using ``rte_malloc_heap_get_socket()``
+ *
+ * @param heap_name
+ * Name of the heap to create.
+ *
+ * @return
+ * - 0 on successful creation
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``heap_name`` was NULL, empty or too long
+ * EEXIST - heap by name of ``heap_name`` already exists
+ * ENOSPC - no more space in internal config to store a new heap
+ */
+int __rte_experimental
+rte_malloc_heap_create(const char *heap_name);
+
+/**
+ * Destroys a previously created malloc heap with specified name.
+ *
+ * @note This function will return a failure result if not all memory allocated
+ * from the heap has been freed back to the heap
+ *
+ * @note This function will return a failure result if not all memory segments
+ * were removed from the heap prior to its destruction
+ *
+ * @param heap_name
+ * Name of the heap to destroy.
+ *
+ * @return
+ * - 0 on success
+ * - -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``heap_name`` was NULL, empty or too long
+ * ENOENT - heap by the name of ``heap_name`` was not found
+ * EPERM - attempting to destroy reserved heap
+ * EBUSY - heap still contains data
+ */
+int __rte_experimental
+rte_malloc_heap_destroy(const char *heap_name);
+
+/**
+ * Find socket ID corresponding to a named heap.
+ *
+ * @param name
+ * Heap name to find socket ID for
+ * @return
+ * Socket ID in case of success (a non-negative number)
+ * -1 in case of error, with rte_errno set to one of the following:
+ * EINVAL - ``name`` was NULL
+ * ENOENT - heap identified by the name ``name`` was not found
+ */
+int __rte_experimental
+rte_malloc_heap_get_socket(const char *name);
+
+/**
+ * Check if a given socket ID refers to externally allocated memory.
+ *
+ * @note Passing SOCKET_ID_ANY will return 0.
+ *
+ * @param socket_id
+ * Socket ID to check
+ * @return
+ * 1 if socket ID refers to externally allocated memory
+ * 0 if socket ID refers to internal DPDK memory
+ * -1 if socket ID is invalid
+ */
+int __rte_experimental
+rte_malloc_heap_socket_is_external(int socket_id);
+
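
Taken together, the external-heap calls compose as below; a hedged sketch in which the heap name, the anonymous mapping and the page size are illustrative, and IOVA addresses are omitted (so the memory is unusable for DMA):

    #include <sys/mman.h>
    #include <rte_malloc.h>

    #define EXT_LEN (2 << 20) /* 2 MB, a multiple of the 4 KB page size */

    static void *
    alloc_from_external_heap(void)
    {
            void *va;
            int socket;

            if (rte_malloc_heap_create("ext_heap") != 0)
                    return NULL;

            va = mmap(NULL, EXT_LEN, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (va == MAP_FAILED)
                    return NULL;

            /* NULL iova_addrs: page IOVAs are set to RTE_BAD_IOVA. */
            if (rte_malloc_heap_memory_add("ext_heap", va, EXT_LEN,
                            NULL, 0, 4096) != 0)
                    return NULL;

            socket = rte_malloc_heap_get_socket("ext_heap");
            return rte_malloc_socket("example", 1024, 0, socket);
    }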
+/**
* Dump statistics.
*
* Dump for the specified type to a file. If the type argument is
diff --git a/lib/librte_eal/common/include/rte_malloc_heap.h b/lib/librte_eal/common/include/rte_malloc_heap.h
index d43fa909..4a7e0eb1 100644
--- a/lib/librte_eal/common/include/rte_malloc_heap.h
+++ b/lib/librte_eal/common/include/rte_malloc_heap.h
@@ -12,6 +12,7 @@
/* Number of free lists per heap, grouped by size. */
#define RTE_HEAP_NUM_FREELISTS 13
+#define RTE_HEAP_NAME_MAX_LEN 32
/* dummy definition, for pointers */
struct malloc_elem;
@@ -26,7 +27,9 @@ struct malloc_heap {
struct malloc_elem *volatile last;
unsigned alloc_count;
+ unsigned int socket_id;
size_t total_size;
+ char name[RTE_HEAP_NAME_MAX_LEN];
} __rte_cache_aligned;
#endif /* _RTE_MALLOC_HEAP_H_ */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index c4b7f4cf..ce937058 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -215,6 +215,9 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -233,6 +236,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg);
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -251,6 +257,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
* @note This function read-locks the memory hotplug subsystem, and thus cannot
* be used within memory-related callback functions.
*
+ * @note This function will also walk through externally allocated segments. It
+ * is up to the user to decide whether to skip these segments.
+ *
* @param func
* Iterator function
* @param arg
@@ -318,6 +327,103 @@ int __rte_experimental
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg);
/**
+ * Return file descriptor associated with a particular memseg (if available).
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @note This returns an internal file descriptor. Performing any operations on
+ * this file descriptor is inherently dangerous, so it should be treated
+ * as read-only for all intents and purposes.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ *
+ * @return
+ * Valid file descriptor in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd(const struct rte_memseg *ms);
+
+/**
+ * Return file descriptor associated with a particular memseg (if available).
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @note This returns an internal file descriptor. Performing any operations on
+ * this file descriptor is inherently dangerous, so it should be treated
+ * as read-only for all intents and purposes.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ *
+ * @return
+ * Valid file descriptor in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms);
+
+/**
+ * Get offset into segment file descriptor associated with a particular memseg
+ * (if available).
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ * be used within memory-related callback functions.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * 0 in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - EINVAL - ``offset`` pointer was NULL
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset);
+
+/**
+ * Get offset into segment file descriptor associated with a particular memseg
+ * (if available).
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ * from within memory-related callback functions.
+ *
+ * @param ms
+ * A pointer to memseg for which to get file descriptor.
+ * @param offset
+ * A pointer to offset value where the result will be stored.
+ *
+ * @return
+ * 0 in case of success.
+ * -1 in case of error, with ``rte_errno`` set to the following values:
+ * - EINVAL - ``ms`` pointer was NULL or did not point to a valid memseg
+ * - EINVAL - ``offset`` pointer was NULL
+ * - ENODEV - ``ms`` fd is not available
+ * - ENOENT - ``ms`` is an unused segment
+ * - ENOTSUP - segment fd's are not supported
+ */
+int __rte_experimental
+rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
+ size_t *offset);
+
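
The locking notes explain why each accessor exists twice: the plain variants take the memory hotplug read lock, so inside a walk callback, which already holds that lock, only the _thread_unsafe variants may be used. A hedged sketch of such a callback (the function name is invented):

    #include <stdio.h>
    #include <rte_common.h>
    #include <rte_memory.h>

    /* Suitable for rte_memseg_walk(dump_seg_fd, NULL). */
    static int
    dump_seg_fd(const struct rte_memseg_list *msl __rte_unused,
                const struct rte_memseg *ms, void *arg __rte_unused)
    {
            size_t offset;
            int fd = rte_memseg_get_fd_thread_unsafe(ms);

            if (fd < 0)
                    return 0; /* skip segments without a backing fd */
            if (rte_memseg_get_fd_offset_thread_unsafe(ms, &offset) == 0)
                    printf("seg %p: fd %d, offset %zu\n", ms->addr, fd, offset);
            return 0;
    }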
+/**
* Dump the physical memory layout to a file.
*
* @note This function read-locks the memory hotplug subsystem, and thus cannot
@@ -357,6 +463,9 @@ unsigned rte_memory_get_nchannel(void);
*/
unsigned rte_memory_get_nrank(void);
+/* check that memseg IOVAs are within the range allowed by the DMA mask */
+int __rte_experimental rte_eal_check_dma_mask(uint8_t maskbits);
+
/**
* Drivers based on uio will not load unless physical
* addresses are obtainable. It is only possible to get
diff --git a/lib/librte_eal/common/include/rte_option.h b/lib/librte_eal/common/include/rte_option.h
new file mode 100644
index 00000000..8957b970
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_option.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef __INCLUDE_RTE_OPTION_H__
+#define __INCLUDE_RTE_OPTION_H__
+
+/**
+ * @file
+ *
+ * This API offers the ability to register options to the EAL command line and
+ * map those options to functions that will be executed at the end of EAL
+ * initialization. These options will be available as part of the EAL command
+ * line of applications and are dynamically managed.
+ *
+ * This is used primarily by DPDK libraries offering command line options.
+ * Currently, this API is limited to registering options without argument.
+ *
+ * The register API can be used to resolve circular dependency issues
+ * between EAL and the library. The library uses EAL, but is also initialized
+ * by EAL. Hence, EAL depends on the init function of the library. The API
+ * introduced in rte_option allows us to register the library init with EAL
+ * (passing a function pointer) and avoid the circular dependency.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int (*rte_option_cb)(void);
+
+/*
+ * Structure describing the EAL command line option being registered.
+ */
+struct rte_option {
+ TAILQ_ENTRY(rte_option) next; /**< Next entry in the list. */
+ char *opt_str; /**< The option name. */
+ rte_option_cb cb; /**< Function called when option is used. */
+ int enabled; /**< Set when the option is used. */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an option to the EAL command line.
+ * When recognized, the associated function will be executed at the end of EAL
+ * initialization.
+ *
+ * The associated structure must be available the whole time this option is
+ * registered (i.e. not stack memory).
+ *
+ * @param opt
+ * Structure describing the option to parse.
+ */
+void __rte_experimental
+rte_option_register(struct rte_option *opt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
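
A hedged example of the registration pattern; the option name and callback are invented, and registering from a constructor (so the option exists before rte_eal_init() parses the command line) is an assumption based on how the API is meant to be used:

    #include <rte_common.h>
    #include <rte_option.h>

    static int
    my_lib_init(void)
    {
            /* Runs at the end of EAL init if --my-lib was given. */
            return 0;
    }

    static struct rte_option my_lib_option = {
            .opt_str = "my-lib", /* assumed to be matched without the "--" */
            .cb = my_lib_init,
    };

    RTE_INIT(my_lib_option_init)
    {
            rte_option_register(&my_lib_option);
    }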
diff --git a/lib/librte_eal/common/include/rte_string_fns.h b/lib/librte_eal/common/include/rte_string_fns.h
index 97597a14..9a2a1ff9 100644
--- a/lib/librte_eal/common/include/rte_string_fns.h
+++ b/lib/librte_eal/common/include/rte_string_fns.h
@@ -16,6 +16,7 @@ extern "C" {
#endif
#include <stdio.h>
+#include <string.h>
/**
* Takes string "string" parameter and splits it at character "delim"
@@ -60,12 +61,10 @@ rte_strlcpy(char *dst, const char *src, size_t size)
/* pull in a strlcpy function */
#ifdef RTE_EXEC_ENV_BSDAPP
-#include <string.h>
#ifndef __BSD_VISIBLE /* non-standard functions are hidden */
#define strlcpy(dst, src, size) rte_strlcpy(dst, src, size)
#endif
-
#else /* non-BSD platforms */
#ifdef RTE_USE_LIBBSD
#include <bsd/string.h>
@@ -76,6 +75,29 @@ rte_strlcpy(char *dst, const char *src, size_t size)
#endif /* RTE_USE_LIBBSD */
#endif /* BSDAPP */
+/**
+ * Copy string src to buffer dst of size dsize.
+ * At most dsize-1 chars will be copied.
+ * Always NUL-terminates, unless (dsize == 0).
+ * Returns the number of bytes copied (terminating NUL-byte excluded) on success;
+ * negative errno on error.
+ *
+ * @param dst
+ * The destination string.
+ *
+ * @param src
+ * The input string to be copied.
+ *
+ * @param dsize
+ * Length in bytes of the destination buffer.
+ *
+ * @return
+ * The number of bytes copied on success
+ * -E2BIG if the destination buffer is too small.
+ */
+ssize_t
+rte_strscpy(char *dst, const char *src, size_t dsize);
+
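
Unlike strlcpy, the return value separates success from truncation directly instead of reporting the would-be source length. A short sketch:

    /* Copy user input, failing cleanly on truncation. */
    static int
    set_ifname(char *out, size_t out_sz, const char *user_input)
    {
            ssize_t n = rte_strscpy(out, user_input, out_sz);

            /* n == -E2BIG: truncated (out is still NUL-terminated) */
            return n < 0 ? (int)n : 0;
    }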
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_eal/common/include/rte_version.h b/lib/librte_eal/common/include/rte_version.h
index 7c6714a2..412ed2db 100644
--- a/lib/librte_eal/common/include/rte_version.h
+++ b/lib/librte_eal/common/include/rte_version.h
@@ -32,7 +32,7 @@ extern "C" {
/**
* Minor version/month number i.e. the mm in yy.mm.z
*/
-#define RTE_VER_MONTH 8
+#define RTE_VER_MONTH 11
/**
* Patch level number i.e. the z in yy.mm.z
@@ -42,14 +42,14 @@ extern "C" {
/**
* Extra string to be appended to version number
*/
-#define RTE_VER_SUFFIX ""
+#define RTE_VER_SUFFIX "-rc"
/**
* Patch release number
* 0-15 = release candidates
* 16 = release
*/
-#define RTE_VER_RELEASE 16
+#define RTE_VER_RELEASE 1
/**
* Macro to compute a version number usable for comparisons
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcc..cae96fab 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
extern "C" {
#endif
+#include <stdint.h>
+
/*
* determine if VFIO is present on the system
*/
@@ -22,6 +24,9 @@ extern "C" {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
#define VFIO_PRESENT
#endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
+#define HAVE_VFIO_DEV_REQ_INTERFACE
+#endif /* kernel version >= 4.0.0 */
#endif /* RTE_EAL_VFIO */
#ifdef VFIO_PRESENT
@@ -44,6 +49,30 @@ extern "C" {
#define RTE_VFIO_NOIOMMU 8
#endif
+/*
+ * Capabilities are only supported on kernels 4.6+. There were also some
+ * API changes, so add a macro to get the cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+ uint16_t id;
+ uint16_t version;
+ uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
#else /* not VFIO_PRESENT */
/* we don't need an actual definition, only pointer is used */
@@ -227,7 +256,7 @@ rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num);
/**
- * Open VFIO container fd or get an existing one
+ * Open a new VFIO container fd
*
* This function is only relevant to linux and will return
* an error on BSD.
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index e0a8ed15..1a74660d 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -39,10 +39,14 @@ malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
/* if we're in IOVA as VA mode, or if we're in legacy mode with
- * hugepages, all elements are IOVA-contiguous.
+ * hugepages, all elements are IOVA-contiguous. however, we can only
+ * make these assumptions about internal memory - externally allocated
+ * segments have to be checked.
*/
- if (rte_eal_iova_mode() == RTE_IOVA_VA ||
- (internal_config.legacy_mem && rte_eal_has_hugepages()))
+ if (!elem->msl->external &&
+ (rte_eal_iova_mode() == RTE_IOVA_VA ||
+ (internal_config.legacy_mem &&
+ rte_eal_has_hugepages())))
return RTE_PTR_DIFF(data_end, contig_seg_start);
cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 12aaf2d7..1973b6e6 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -29,6 +29,10 @@
#include "malloc_heap.h"
#include "malloc_mp.h"
+/* start external socket IDs at a very high number */
+#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */
+#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))
+
static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
@@ -66,6 +70,21 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
return check_flag & flags;
}
+int
+malloc_socket_to_heap_id(unsigned int socket_id)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i;
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+
+ if (heap->socket_id == socket_id)
+ return i;
+ }
+ return -1;
+}
+
/*
* Expand the heap with a memory area.
*/
@@ -93,9 +112,17 @@ malloc_add_seg(const struct rte_memseg_list *msl,
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct rte_memseg_list *found_msl;
struct malloc_heap *heap;
- int msl_idx;
+ int msl_idx, heap_idx;
+
+ if (msl->external)
+ return 0;
- heap = &mcfg->malloc_heaps[msl->socket_id];
+ heap_idx = malloc_socket_to_heap_id(msl->socket_id);
+ if (heap_idx < 0) {
+ RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
+ return -1;
+ }
+ heap = &mcfg->malloc_heaps[heap_idx];
/* msl is const, so find it */
msl_idx = msl - mcfg->memsegs;
@@ -165,7 +192,9 @@ find_biggest_element(struct malloc_heap *heap, size_t *size,
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
size_t cur_size;
- if (!check_hugepage_sz(flags, elem->msl->page_sz))
+ if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
+ !check_hugepage_sz(flags,
+ elem->msl->page_sz))
continue;
if (contig) {
cur_size =
@@ -259,11 +288,13 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
bool contig, struct rte_memseg **ms, int n_segs)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct rte_memseg_list *msl;
struct malloc_elem *elem = NULL;
size_t alloc_sz;
int allocd_pages;
void *ret, *map_addr;
+ uint64_t mask;
alloc_sz = (size_t)pg_sz * n_segs;
@@ -291,6 +322,16 @@ alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
goto fail;
}
+ if (mcfg->dma_maskbits) {
+ mask = ~((1ULL << mcfg->dma_maskbits) - 1);
+ if (rte_eal_check_dma_mask(mask)) {
+ RTE_LOG(ERR, EAL,
+ "%s(): couldn't allocate memory due to DMA mask\n",
+ __func__);
+ goto fail;
+ }
+ }
+
/* add newly minted memsegs to malloc heap */
elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);
@@ -326,11 +367,9 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
/* we can't know in advance how many pages we'll need, so we malloc */
ms = malloc(sizeof(*ms) * n_segs);
-
- memset(ms, 0, sizeof(*ms) * n_segs);
-
if (ms == NULL)
return -1;
+ memset(ms, 0, sizeof(*ms) * n_segs);
elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
bound, contig, ms, n_segs);
@@ -560,12 +599,14 @@ alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
/* this will try lower page sizes first */
static void *
-heap_alloc_on_socket(const char *type, size_t size, int socket,
- unsigned int flags, size_t align, size_t bound, bool contig)
+malloc_heap_alloc_on_heap_id(const char *type, size_t size,
+ unsigned int heap_id, unsigned int flags, size_t align,
+ size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ int socket_id;
void *ret;
rte_spinlock_lock(&(heap->lock));
@@ -583,12 +624,28 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
* we may still be able to allocate memory from appropriate page sizes,
* we just need to request more memory first.
*/
+
+ socket_id = rte_socket_id_by_idx(heap_id);
+ /*
+ * If the socket ID is negative, we cannot find a socket ID for this
+ * heap, which means it is an external heap. Those can have unexpected
+ * page sizes, so if the user asked to allocate from there, assume they
+ * know what they're doing and allow any page size flags.
+ */
+ if (socket_id < 0)
+ size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;
+
ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
if (ret != NULL)
goto alloc_unlock;
- if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
- contig)) {
+ /* if socket ID is invalid, this is an external heap */
+ if (socket_id < 0)
+ goto alloc_unlock;
+
+ if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
+ bound, contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);
/* this should have succeeded */
@@ -604,14 +661,14 @@ void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
unsigned int flags, size_t align, size_t bound, bool contig)
{
- int socket, i, cur_socket;
+ int socket, heap_id, i;
void *ret;
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
- if (!rte_eal_has_hugepages())
+ if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
socket_arg = SOCKET_ID_ANY;
if (socket_arg == SOCKET_ID_ANY)
@@ -619,22 +676,25 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
else
socket = socket_arg;
- /* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ /* turn socket ID into heap ID */
+ heap_id = malloc_socket_to_heap_id(socket);
+ /* if heap id is negative, socket ID was invalid */
+ if (heap_id < 0)
return NULL;
- ret = heap_alloc_on_socket(type, size, socket, flags, align, bound,
- contig);
+ ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
+ bound, contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;
- /* try other heaps */
+ /* try other heaps. we are only iterating through native DPDK sockets,
+ * so external heaps won't be included.
+ */
for (i = 0; i < (int) rte_socket_count(); i++) {
- cur_socket = rte_socket_id_by_idx(i);
- if (cur_socket == socket)
+ if (i == heap_id)
continue;
- ret = heap_alloc_on_socket(type, size, cur_socket, flags,
- align, bound, contig);
+ ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
+ bound, contig);
if (ret != NULL)
return ret;
}
@@ -642,11 +702,11 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
}
static void *
-heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
- size_t align, bool contig)
+heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
+ unsigned int flags, size_t align, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
void *ret;
rte_spinlock_lock(&(heap->lock));
@@ -664,7 +724,7 @@ void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
size_t align, bool contig)
{
- int socket, i, cur_socket;
+ int socket, i, cur_socket, heap_id;
void *ret;
/* return NULL if align is not power-of-2 */
@@ -679,11 +739,13 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
else
socket = socket_arg;
- /* Check socket parameter */
- if (socket >= RTE_MAX_NUMA_NODES)
+ /* turn socket ID into heap ID */
+ heap_id = malloc_socket_to_heap_id(socket);
+ /* if heap id is negative, socket ID was invalid */
+ if (heap_id < 0)
return NULL;
- ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+ ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
contig);
if (ret != NULL || socket_arg != SOCKET_ID_ANY)
return ret;
@@ -693,8 +755,8 @@ malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
cur_socket = rte_socket_id_by_idx(i);
if (cur_socket == socket)
continue;
- ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
- align, contig);
+ ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
+ contig);
if (ret != NULL)
return ret;
}
@@ -756,8 +818,10 @@ malloc_heap_free(struct malloc_elem *elem)
/* anything after this is a bonus */
ret = 0;
- /* ...of which we can't avail if we are in legacy mode */
- if (internal_config.legacy_mem)
+ /* ...of which we can't avail if we are in legacy mode, or if this is an
+ * externally allocated segment.
+ */
+ if (internal_config.legacy_mem || (msl->external > 0))
goto free_unlock;
/* check if we can free any memory back to the system */
@@ -914,7 +978,7 @@ malloc_heap_resize(struct malloc_elem *elem, size_t size)
}
/*
- * Function to retrieve data for heap on given socket
+ * Function to retrieve data for a given heap
*/
int
malloc_heap_get_stats(struct malloc_heap *heap,
@@ -952,7 +1016,7 @@ malloc_heap_get_stats(struct malloc_heap *heap,
}
/*
- * Function to retrieve data for heap on given socket
+ * Function to retrieve data for a given heap
*/
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
@@ -973,10 +1037,216 @@ malloc_heap_dump(struct malloc_heap *heap, FILE *f)
rte_spinlock_unlock(&heap->lock);
}
+static int
+destroy_seg(struct malloc_elem *elem, size_t len)
+{
+ struct malloc_heap *heap = elem->heap;
+ struct rte_memseg_list *msl;
+
+ msl = elem->msl;
+
+ /* notify all subscribers that a memory area is going to be removed */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);
+
+ /* this element can be removed */
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, elem, len);
+
+ heap->total_size -= len;
+
+ memset(elem, 0, sizeof(*elem));
+
+ /* destroy the fbarray backing this memory */
+ if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
+ return -1;
+
+ /* reset the memseg list */
+ memset(msl, 0, sizeof(*msl));
+
+ return 0;
+}
+
+int
+malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ char fbarray_name[RTE_FBARRAY_NAME_LEN];
+ struct rte_memseg_list *msl = NULL;
+ struct rte_fbarray *arr;
+ size_t seg_len = n_pages * page_sz;
+ unsigned int i;
+
+ /* first, find a free memseg list */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *tmp = &mcfg->memsegs[i];
+ if (tmp->base_va == NULL) {
+ msl = tmp;
+ break;
+ }
+ }
+ if (msl == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
+ rte_errno = ENOSPC;
+ return -1;
+ }
+
+ snprintf(fbarray_name, sizeof(fbarray_name) - 1, "%s_%p",
+ heap->name, va_addr);
+
+ /* create the backing fbarray */
+ if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
+ sizeof(struct rte_memseg)) < 0) {
+ RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+
+ /* fbarray created, fill it up */
+ for (i = 0; i < n_pages; i++) {
+ struct rte_memseg *ms;
+
+ rte_fbarray_set_used(arr, i);
+ ms = rte_fbarray_get(arr, i);
+ ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
+ ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
+ ms->hugepage_sz = page_sz;
+ ms->len = page_sz;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->socket_id = heap->socket_id;
+ }
+
+ /* set up the memseg list */
+ msl->base_va = va_addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = heap->socket_id;
+ msl->len = seg_len;
+ msl->version = 0;
+ msl->external = 1;
+
+ /* erase contents of new memory */
+ memset(va_addr, 0, seg_len);
+
+ /* now, add newly minted memory to the malloc heap */
+ malloc_heap_add_memory(heap, msl, va_addr, seg_len);
+
+ heap->total_size += seg_len;
+
+ /* all done! */
+ RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
+ heap->name, va_addr);
+
+ /* notify all subscribers that a new memory area has been added */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ va_addr, seg_len);
+
+ return 0;
+}
+
+int
+malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
+ size_t len)
+{
+ struct malloc_elem *elem = heap->first;
+
+ /* find element with specified va address */
+ while (elem != NULL && elem != va_addr) {
+ elem = elem->next;
+ /* stop if we've blown past our VA */
+ if (elem > (struct malloc_elem *)va_addr) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+ }
+ /* check if element was found */
+ if (elem == NULL || elem->msl->len != len) {
+ rte_errno = ENOENT;
+ return -1;
+ }
+ /* if element is busy, or its size doesn't span the whole segment, the
+ * segment is still in use
+ */
+ if (elem->state == ELEM_BUSY || elem->size != len) {
+ rte_errno = EBUSY;
+ return -1;
+ }
+ return destroy_seg(elem, len);
+}
+
+int
+malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ uint32_t next_socket_id = mcfg->next_socket_id;
+
+ /* prevent overflow. did you really create 2 billion heaps??? */
+ if (next_socket_id > INT32_MAX) {
+ RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n");
+ rte_errno = ENOSPC;
+ return -1;
+ }
+
+ /* initialize empty heap */
+ heap->alloc_count = 0;
+ heap->first = NULL;
+ heap->last = NULL;
+ LIST_INIT(heap->free_head);
+ rte_spinlock_init(&heap->lock);
+ heap->total_size = 0;
+ heap->socket_id = next_socket_id;
+
+ /* we hold a global mem hotplug writelock, so it's safe to increment */
+ mcfg->next_socket_id++;
+
+ /* set up name */
+ strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
+ return 0;
+}
+
+int
+malloc_heap_destroy(struct malloc_heap *heap)
+{
+ if (heap->alloc_count != 0) {
+ RTE_LOG(ERR, EAL, "Heap is still in use\n");
+ rte_errno = EBUSY;
+ return -1;
+ }
+ if (heap->first != NULL || heap->last != NULL) {
+ RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
+ rte_errno = EBUSY;
+ return -1;
+ }
+ if (heap->total_size != 0)
+ RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");
+
+ /* the caller holds the heap lock; zeroing the heap re-initializes it,
+ * effectively dropping the lock
+ */
+ memset(heap, 0, sizeof(*heap));
+
+ return 0;
+}
+
int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* assign min socket ID to external heaps */
+ mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;
+
+ /* assign names to default DPDK heaps */
+ for (i = 0; i < rte_socket_count(); i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+ char heap_name[RTE_HEAP_NAME_MAX_LEN];
+ int socket_id = rte_socket_id_by_idx(i);
+
+ snprintf(heap_name, sizeof(heap_name) - 1,
+ "socket_%i", socket_id);
+ strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
+ heap->socket_id = socket_id;
+ }
+ }
+
if (register_mp_requests()) {
RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index f52cb555..e48996d5 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -34,6 +34,20 @@ malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
size_t align, bool contig);
int
+malloc_heap_create(struct malloc_heap *heap, const char *heap_name);
+
+int
+malloc_heap_destroy(struct malloc_heap *heap);
+
+int
+malloc_heap_add_external_memory(struct malloc_heap *heap, void *va_addr,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
+int
+malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
+ size_t len);
+
+int
malloc_heap_free(struct malloc_elem *elem);
int
@@ -47,6 +61,9 @@ void
malloc_heap_dump(struct malloc_heap *heap, FILE *f);
int
+malloc_socket_to_heap_id(unsigned int socket_id);
+
+int
rte_eal_malloc_heap_init(void);
#ifdef __cplusplus
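For reference, the body of malloc_socket_to_heap_id() is not among the hunks shown
here. Conceptually it maps a socket ID (a physical NUMA node, or a fake ID handed
out by malloc_heap_create()) to an index into mcfg->malloc_heaps. A minimal sketch
of a plausible implementation, assuming only the heap array and socket_id field
used throughout this patch:

	int
	malloc_socket_to_heap_id(unsigned int socket_id)
	{
		struct rte_mem_config *mcfg =
				rte_eal_get_configuration()->mem_config;
		int i;

		for (i = 0; i < RTE_MAX_HEAPS; i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];

			if (heap->socket_id == socket_id)
				return i; /* heap ID is the array index */
		}
		return -1; /* no heap serves this socket ID */
	}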
diff --git a/lib/librte_eal/common/malloc_mp.c b/lib/librte_eal/common/malloc_mp.c
index 931c14bc..5f2d4e0b 100644
--- a/lib/librte_eal/common/malloc_mp.c
+++ b/lib/librte_eal/common/malloc_mp.c
@@ -194,13 +194,11 @@ handle_alloc_request(const struct malloc_mp_req *m,
/* we can't know in advance how many pages we'll need, so we malloc */
ms = malloc(sizeof(*ms) * n_segs);
-
- memset(ms, 0, sizeof(*ms) * n_segs);
-
if (ms == NULL) {
RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
goto fail;
}
+ memset(ms, 0, sizeof(*ms) * n_segs);
elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
ar->flags, ar->align, ar->bound, ar->contig, ms,
diff --git a/lib/librte_eal/common/meson.build b/lib/librte_eal/common/meson.build
index 56005bea..2a10d57d 100644
--- a/lib/librte_eal/common/meson.build
+++ b/lib/librte_eal/common/meson.build
@@ -14,6 +14,7 @@ common_sources = files(
'eal_common_errno.c',
'eal_common_fbarray.c',
'eal_common_hexdump.c',
+ 'eal_common_hypervisor.c',
'eal_common_launch.c',
'eal_common_lcore.c',
'eal_common_log.c',
@@ -27,11 +28,13 @@ common_sources = files(
'eal_common_thread.c',
'eal_common_timer.c',
'eal_common_uuid.c',
+ 'hotplug_mp.c',
'malloc_elem.c',
'malloc_heap.c',
'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
+ 'rte_option.c',
'rte_reciprocal.c',
'rte_service.c'
)
@@ -59,6 +62,7 @@ common_headers = files(
'include/rte_errno.h',
'include/rte_fbarray.h',
'include/rte_hexdump.h',
+ 'include/rte_hypervisor.h',
'include/rte_interrupts.h',
'include/rte_keepalive.h',
'include/rte_launch.h',
@@ -68,6 +72,7 @@ common_headers = files(
'include/rte_malloc_heap.h',
'include/rte_memory.h',
'include/rte_memzone.h',
+ 'include/rte_option.h',
'include/rte_pci_dev_feature_defs.h',
'include/rte_pci_dev_features.h',
'include/rte_per_lcore.h',
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d11..9e61dc41 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -8,6 +8,7 @@
#include <string.h>
#include <sys/queue.h>
+#include <rte_errno.h>
#include <rte_memcpy.h>
#include <rte_memory.h>
#include <rte_eal.h>
@@ -23,6 +24,7 @@
#include <rte_malloc.h>
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "eal_memalloc.h"
/* Free the memory space back to heap */
@@ -44,13 +46,15 @@ rte_malloc_socket(const char *type, size_t size, unsigned int align,
if (size == 0 || (align && !rte_is_power_of_2(align)))
return NULL;
- if (!rte_eal_has_hugepages())
+ /* if there are no hugepages and if we are not allocating from an
+ * external heap, use memory from any socket available. checking for
+ * socket being external may return -1 in case of invalid socket, but
+ * that's OK - if there are no hugepages, it doesn't matter.
+ */
+ if (rte_malloc_heap_socket_is_external(socket_arg) != 1 &&
+ !rte_eal_has_hugepages())
socket_arg = SOCKET_ID_ANY;
- /* Check socket parameter */
- if (socket_arg >= RTE_MAX_NUMA_NODES)
- return NULL;
-
return malloc_heap_alloc(type, size, socket_arg, 0,
align == 0 ? 1 : align, 0, false);
}
@@ -152,11 +156,20 @@ rte_malloc_get_socket_stats(int socket,
struct rte_malloc_socket_stats *socket_stats)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int heap_idx, ret = -1;
- if (socket >= RTE_MAX_NUMA_NODES || socket < 0)
- return -1;
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ heap_idx = malloc_socket_to_heap_id(socket);
+ if (heap_idx < 0)
+ goto unlock;
- return malloc_heap_get_stats(&mcfg->malloc_heaps[socket], socket_stats);
+ ret = malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx],
+ socket_stats);
+unlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
}
/*
@@ -168,12 +181,75 @@ rte_malloc_dump_heaps(FILE *f)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
unsigned int idx;
- for (idx = 0; idx < rte_socket_count(); idx++) {
- unsigned int socket = rte_socket_id_by_idx(idx);
- fprintf(f, "Heap on socket %i:\n", socket);
- malloc_heap_dump(&mcfg->malloc_heaps[socket], f);
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ fprintf(f, "Heap id: %u\n", idx);
+ malloc_heap_dump(&mcfg->malloc_heaps[idx], f);
+ }
+
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+}
+
+int
+rte_malloc_heap_get_socket(const char *name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ unsigned int idx;
+ int ret;
+
+ if (name == NULL ||
+ strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[idx];
+
+ if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) {
+ heap = tmp;
+ break;
+ }
+ }
+
+ if (heap != NULL) {
+ ret = heap->socket_id;
+ } else {
+ rte_errno = ENOENT;
+ ret = -1;
+ }
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_socket_is_external(int socket_id)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int idx;
+ int ret = -1;
+
+ if (socket_id == SOCKET_ID_ANY)
+ return 0;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+ for (idx = 0; idx < RTE_MAX_HEAPS; idx++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[idx];
+
+ if ((int)tmp->socket_id == socket_id) {
+ /* external memory always has large socket IDs */
+ ret = tmp->socket_id >= RTE_MAX_NUMA_NODES;
+ break;
+ }
}
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}
/*
@@ -182,14 +258,20 @@ rte_malloc_dump_heaps(FILE *f)
void
rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
{
- unsigned int socket;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int heap_id;
struct rte_malloc_socket_stats sock_stats;
+
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
/* Iterate through all initialised heaps */
- for (socket=0; socket< RTE_MAX_NUMA_NODES; socket++) {
- if ((rte_malloc_get_socket_stats(socket, &sock_stats) < 0))
- continue;
+ for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
+
+ malloc_heap_get_stats(heap, &sock_stats);
- fprintf(f, "Socket:%u\n", socket);
+ fprintf(f, "Heap id:%u\n", heap_id);
+ fprintf(f, "\tHeap name:%s\n", heap->name);
fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes);
fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes);
fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
@@ -198,6 +280,7 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
}
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
return;
}
@@ -223,7 +306,7 @@ rte_malloc_virt2iova(const void *addr)
if (elem == NULL)
return RTE_BAD_IOVA;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
return (uintptr_t) addr;
ms = rte_mem_virt2memseg(addr, elem->msl);
@@ -235,3 +318,320 @@ rte_malloc_virt2iova(const void *addr)
return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
+
+static struct malloc_heap *
+find_named_heap(const char *name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *heap = &mcfg->malloc_heaps[i];
+
+ if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN))
+ return heap;
+ }
+ return NULL;
+}
+
+int
+rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len,
+ rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ unsigned int n;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL ||
+ page_sz == 0 || !rte_is_power_of_2(page_sz) ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ /* cannot add memory to internal heaps */
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+ n = len / page_sz;
+ if (n != n_pages && iova_addrs != NULL) {
+ rte_errno = EINVAL;
+ ret = -1;
+ goto unlock;
+ }
+
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_add_external_memory(heap, va_addr, iova_addrs, n,
+ page_sz);
+ rte_spinlock_unlock(&heap->lock);
+
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL || len == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ /* cannot remove memory from internal heaps */
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_remove_external_memory(heap, va_addr, len);
+ rte_spinlock_unlock(&heap->lock);
+
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+struct sync_mem_walk_arg {
+ void *va_addr;
+ size_t len;
+ int result;
+ bool attach;
+};
+
+static int
+sync_mem_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct sync_mem_walk_arg *wa = arg;
+ size_t len = msl->page_sz * msl->memseg_arr.len;
+
+ if (msl->base_va == wa->va_addr &&
+ len == wa->len) {
+ struct rte_memseg_list *found_msl;
+ int msl_idx, ret;
+
+ /* msl is const */
+ msl_idx = msl - mcfg->memsegs;
+ found_msl = &mcfg->memsegs[msl_idx];
+
+ if (wa->attach) {
+ ret = rte_fbarray_attach(&found_msl->memseg_arr);
+ } else {
+ /* notify all subscribers that a memory area is about to
+ * be removed
+ */
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ msl->base_va, msl->len);
+ ret = rte_fbarray_detach(&found_msl->memseg_arr);
+ }
+
+ if (ret < 0) {
+ wa->result = -rte_errno;
+ } else {
+ /* notify all subscribers that a new memory area was
+ * added
+ */
+ if (wa->attach)
+ eal_memalloc_mem_event_notify(
+ RTE_MEM_EVENT_ALLOC,
+ msl->base_va, msl->len);
+ wa->result = 0;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int
+sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ struct sync_mem_walk_arg wa;
+ int ret;
+
+ if (heap_name == NULL || va_addr == NULL || len == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ /* we shouldn't be able to sync to internal heaps */
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* find corresponding memseg list to sync to */
+ wa.va_addr = va_addr;
+ wa.len = len;
+ wa.result = -ENOENT; /* fail unless explicitly told to succeed */
+ wa.attach = attach;
+
+ /* we're already holding a read lock */
+ rte_memseg_list_walk_thread_unsafe(sync_mem_walk, &wa);
+
+ if (wa.result < 0) {
+ rte_errno = -wa.result;
+ ret = -1;
+ } else {
+ /* notify all subscribers that a new memory area was added */
+ if (attach)
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ va_addr, len);
+ ret = 0;
+ }
+unlock:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+int
+rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len)
+{
+ return sync_memory(heap_name, va_addr, len, true);
+}
+
+int
+rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len)
+{
+ return sync_memory(heap_name, va_addr, len, false);
+}
+
+int
+rte_malloc_heap_create(const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int i, ret;
+
+ if (heap_name == NULL ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ /* check if there is space in the heap list, or if heap with this name
+ * already exists.
+ */
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ for (i = 0; i < RTE_MAX_HEAPS; i++) {
+ struct malloc_heap *tmp = &mcfg->malloc_heaps[i];
+ /* existing heap */
+ if (strncmp(heap_name, tmp->name,
+ RTE_HEAP_NAME_MAX_LEN) == 0) {
+ RTE_LOG(ERR, EAL, "Heap %s already exists\n",
+ heap_name);
+ rte_errno = EEXIST;
+ ret = -1;
+ goto unlock;
+ }
+ /* empty heap */
+ if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) {
+ heap = tmp;
+ break;
+ }
+ }
+ if (heap == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n");
+ rte_errno = ENOSPC;
+ ret = -1;
+ goto unlock;
+ }
+
+ /* we're sure that we can create a new heap, so do it */
+ ret = malloc_heap_create(heap, heap_name);
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
+
+int
+rte_malloc_heap_destroy(const char *heap_name)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = NULL;
+ int ret;
+
+ if (heap_name == NULL ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 ||
+ strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) ==
+ RTE_HEAP_NAME_MAX_LEN) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /* find our heap */
+ heap = find_named_heap(heap_name);
+ if (heap == NULL) {
+ RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name);
+ rte_errno = ENOENT;
+ ret = -1;
+ goto unlock;
+ }
+ /* we shouldn't be able to destroy internal heaps */
+ if (heap->socket_id < RTE_MAX_NUMA_NODES) {
+ rte_errno = EPERM;
+ ret = -1;
+ goto unlock;
+ }
+ /* sanity checks done, now we can destroy the heap */
+ rte_spinlock_lock(&heap->lock);
+ ret = malloc_heap_destroy(heap);
+
+ /* if we failed, lock is still active */
+ if (ret < 0)
+ rte_spinlock_unlock(&heap->lock);
+unlock:
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+
+ return ret;
+}
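Taken together, the additions to rte_malloc.c form a complete external-heap
lifecycle: create a named heap, hand it memory the application manages itself,
allocate from it via the heap's fake socket ID, then tear everything down in
reverse order. A minimal usage sketch (the heap name, page size and anonymous
mapping are illustrative; real users would typically pass hugepage-backed memory
and, for DMA, a populated iova_addrs[] array):

	#include <sys/mman.h>
	#include <rte_malloc.h>

	#define EXT_HEAP_NAME "session_heap"	/* hypothetical name */
	#define EXT_PAGE_SZ (1ULL << 21)	/* pretend 2M pages */
	#define EXT_N_PAGES 64

	static int
	use_external_heap(void)
	{
		size_t len = EXT_N_PAGES * EXT_PAGE_SZ;
		void *addr, *obj;
		int socket_id;

		/* application-managed memory backing the heap */
		addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (addr == MAP_FAILED)
			return -1;

		if (rte_malloc_heap_create(EXT_HEAP_NAME) < 0)
			return -1;
		/* NULL iova_addrs[]: no DMA, pages are marked RTE_BAD_IOVA */
		if (rte_malloc_heap_memory_add(EXT_HEAP_NAME, addr, len,
				NULL, EXT_N_PAGES, EXT_PAGE_SZ) < 0)
			return -1;

		/* external heaps are addressed through their fake socket ID */
		socket_id = rte_malloc_heap_get_socket(EXT_HEAP_NAME);
		obj = rte_malloc_socket("ext_obj", 4096, 0, socket_id);
		rte_free(obj);

		/* teardown in reverse order; both calls fail with EBUSY if
		 * the heap is still in use
		 */
		if (rte_malloc_heap_memory_remove(EXT_HEAP_NAME, addr, len) < 0)
			return -1;
		if (rte_malloc_heap_destroy(EXT_HEAP_NAME) < 0)
			return -1;
		munmap(addr, len);
		return 0;
	}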
diff --git a/lib/librte_eal/common/rte_option.c b/lib/librte_eal/common/rte_option.c
new file mode 100644
index 00000000..02d59a86
--- /dev/null
+++ b/lib/librte_eal/common/rte_option.c
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_eal.h>
+#include <rte_option.h>
+
+#include "eal_private.h"
+
+TAILQ_HEAD(rte_option_list, rte_option);
+
+struct rte_option_list rte_option_list =
+ TAILQ_HEAD_INITIALIZER(rte_option_list);
+
+static struct rte_option *option;
+
+int
+rte_option_parse(const char *opt)
+{
+ /* Check if the option is registered */
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (strcmp(opt, option->opt_str) == 0) {
+ option->enabled = 1;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+void __rte_experimental
+rte_option_register(struct rte_option *opt)
+{
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (strcmp(opt->opt_str, option->opt_str) == 0) {
+ RTE_LOG(INFO, EAL, "Option %s has already been registered.\n",
+ opt->opt_str);
+ return;
+ }
+ }
+
+ TAILQ_INSERT_HEAD(&rte_option_list, opt, next);
+}
+
+void
+rte_option_init(void)
+{
+ TAILQ_FOREACH(option, &rte_option_list, next) {
+ if (option->enabled)
+ option->cb();
+ }
+}
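For context, a consumer of this mechanism registers its flag before rte_eal_init()
runs, typically from a constructor, and the callback fires at the end of EAL
initialization only if the flag was seen on the command line. A minimal sketch,
modeled on how the --telemetry flag uses this API (the callback body is
illustrative, and an int (*)(void) callback type is assumed to match the
option->cb() call above):

	#include <rte_option.h>
	#include <rte_log.h>
	#include <rte_common.h>

	static int
	telemetry_option_cb(void)	/* hypothetical callback */
	{
		RTE_LOG(INFO, EAL, "--telemetry given, enabling telemetry\n");
		return 0;
	}

	static struct rte_option telemetry_option = {
		.opt_str = "--telemetry",
		.cb = telemetry_option_cb,
		.enabled = 0,
	};

	/* runs as a constructor, before rte_eal_init() parses argv */
	RTE_INIT(register_telemetry_option)
	{
		rte_option_register(&telemetry_option);
	}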
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index fd92c75c..51deb579 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH)
EXPORT_MAP := ../../rte_eal_version.map
VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
-LIBABIVER := 8
+LIBABIVER := 9
VPATH += $(RTE_SDK)/lib/librte_eal/common
@@ -70,10 +70,12 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
@@ -85,22 +87,6 @@ SRCS-y += rte_cycles.c
CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-CFLAGS_eal.o := -D_GNU_SOURCE
-CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
-CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
-CFLAGS_eal_timer.o := -D_GNU_SOURCE
-CFLAGS_eal_lcore.o := -D_GNU_SOURCE
-CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
-CFLAGS_eal_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_log.o := -D_GNU_SOURCE
-CFLAGS_eal_common_log.o := -D_GNU_SOURCE
-CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
-CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
-CFLAGS_eal_common_options.o := -D_GNU_SOURCE
-CFLAGS_eal_common_thread.o := -D_GNU_SOURCE
-CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE
-CFLAGS_rte_cycles.o := -D_GNU_SOURCE
-
# workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index e59ac657..361744d4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -48,6 +48,7 @@
#include <rte_atomic.h>
#include <malloc_heap.h>
#include <rte_vfio.h>
+#include <rte_option.h>
#include "eal_private.h"
#include "eal_thread.h"
@@ -149,7 +150,7 @@ eal_create_runtime_dir(void)
}
const char *
-eal_get_runtime_dir(void)
+rte_eal_get_runtime_dir(void)
{
return runtime_dir;
}
@@ -263,6 +264,8 @@ rte_eal_config_create(void)
* processes could later map the config into this exact location */
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+ rte_config.mem_config->dma_maskbits = 0;
+
}
/* attach to an existing shared memory config */
@@ -352,6 +355,24 @@ eal_proc_type_detect(void)
return ptype;
}
+/* copies data from internal config to shared config */
+static void
+eal_update_mem_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ mcfg->legacy_mem = internal_config.legacy_mem;
+ mcfg->single_file_segments = internal_config.single_file_segments;
+}
+
+/* copies data from shared config to internal config */
+static void
+eal_update_internal_config(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ internal_config.legacy_mem = mcfg->legacy_mem;
+ internal_config.single_file_segments = mcfg->single_file_segments;
+}
+
/* Sets up rte_config structure with the pointer to shared memory config.*/
static void
rte_config_init(void)
@@ -361,11 +382,13 @@ rte_config_init(void)
switch (rte_config.process_type){
case RTE_PROC_PRIMARY:
rte_eal_config_create();
+ eal_update_mem_config();
break;
case RTE_PROC_SECONDARY:
rte_eal_config_attach();
rte_eal_mcfg_wait_complete(rte_config.mem_config);
rte_eal_config_reattach();
+ eal_update_internal_config();
break;
case RTE_PROC_AUTO:
case RTE_PROC_INVALID:
@@ -580,12 +603,20 @@ eal_parse_args(int argc, char **argv)
argvopt = argv;
optind = 1;
+ opterr = 0;
while ((opt = getopt_long(argc, argvopt, eal_short_options,
eal_long_options, &option_index)) != EOF) {
- /* getopt is not happy, stop right now */
+ /*
+ * getopt didn't recognise the option, let's parse the
+ * registered options to see if the flag is valid
+ */
if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
eal_usage(prgname);
ret = -1;
goto out;
@@ -725,6 +756,9 @@ check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
+ if (msl->external)
+ return 0;
+
return *socket_id == msl->socket_id;
}
@@ -793,7 +827,8 @@ rte_eal_init(int argc, char **argv)
int i, fctret, ret;
pthread_t thread_id;
static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
- const char *logid;
+ const char *p;
+ static char logid[PATH_MAX];
char cpuset[RTE_CPU_AFFINITY_STR_LEN];
char thread_name[RTE_MAX_THREAD_NAME_LEN];
@@ -810,9 +845,8 @@ rte_eal_init(int argc, char **argv)
return -1;
}
- logid = strrchr(argv[0], '/');
- logid = strdup(logid ? logid + 1: argv[0]);
-
+ p = strrchr(argv[0], '/');
+ strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
thread_id = pthread_self();
eal_reset_internal_config(&internal_config);
@@ -835,7 +869,7 @@ rte_eal_init(int argc, char **argv)
}
if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins\n");
+ rte_eal_init_alert("Cannot init plugins");
rte_errno = EINVAL;
rte_atomic32_clear(&run_once);
return -1;
@@ -850,7 +884,7 @@ rte_eal_init(int argc, char **argv)
rte_config_init();
if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
return -1;
}
@@ -858,30 +892,43 @@ rte_eal_init(int argc, char **argv)
* bus through mp channel in the secondary process before the bus scan.
*/
if (rte_mp_channel_init() < 0) {
- rte_eal_init_alert("failed to init mp channel\n");
+ rte_eal_init_alert("failed to init mp channel");
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
rte_errno = EFAULT;
return -1;
}
}
+ /* register multi-process action callbacks for hotplug */
+ if (rte_mp_dev_hotplug_init() < 0) {
+ rte_eal_init_alert("failed to register mp callback for hotplug");
+ return -1;
+ }
+
if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices\n");
+ rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
rte_atomic32_clear(&run_once);
return -1;
}
- /* autodetect the iova mapping mode (default is iova_pa) */
- rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
-
- /* Workaround for KNI which requires physical address to work */
- if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
- rte_eal_check_module("rte_kni") == 1) {
- rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
- RTE_LOG(WARNING, EAL,
- "Some devices want IOVA as VA but PA will be used because.. "
- "KNI module inserted\n");
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ rte_eal_get_configuration()->iova_mode =
+ rte_bus_get_iommu_class();
+
+ /* Workaround for KNI which requires physical address to work */
+ if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+ rte_eal_check_module("rte_kni") == 1) {
+ rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+ RTE_LOG(WARNING, EAL,
+ "Some devices want IOVA as VA but PA will be used because.. "
+ "KNI module inserted\n");
+ }
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
}
if (internal_config.no_hugetlbfs == 0) {
@@ -924,7 +971,7 @@ rte_eal_init(int argc, char **argv)
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
- rte_eal_init_alert("Cannot init VFIO\n");
+ rte_eal_init_alert("Cannot init VFIO");
rte_errno = EAGAIN;
rte_atomic32_clear(&run_once);
return -1;
@@ -935,13 +982,13 @@ rte_eal_init(int argc, char **argv)
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone\n");
+ rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory\n");
+ rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
@@ -950,25 +997,25 @@ rte_eal_init(int argc, char **argv)
eal_hugedirs_unlock();
if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap\n");
+ rte_eal_init_alert("Cannot init malloc heap");
rte_errno = ENODEV;
return -1;
}
if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects\n");
+ rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
return -1;
}
if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread\n");
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
/* rte_eal_alarm_init sets rte_errno on failure. */
return -1;
}
if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers\n");
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
rte_errno = ENOTSUP;
return -1;
}
@@ -979,8 +1026,8 @@ rte_eal_init(int argc, char **argv)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- rte_config.master_lcore, (int)thread_id, cpuset,
+ RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
ret == 0 ? "" : "...");
RTE_LCORE_FOREACH_SLAVE(i) {
@@ -1022,14 +1069,14 @@ rte_eal_init(int argc, char **argv)
/* initialize services so vdevs register service during bus_probe. */
ret = rte_service_init();
if (ret) {
- rte_eal_init_alert("rte_service_init() failed\n");
+ rte_eal_init_alert("rte_service_init() failed");
rte_errno = ENOEXEC;
return -1;
}
/* Probe all the buses and devices/drivers on them */
if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices\n");
+ rte_eal_init_alert("Cannot probe devices");
rte_errno = ENOTSUP;
return -1;
}
@@ -1051,6 +1098,9 @@ rte_eal_init(int argc, char **argv)
rte_eal_mcfg_complete();
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
return fctret;
}
@@ -1059,7 +1109,12 @@ mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
void *arg __rte_unused)
{
/* ms is const, so find this memseg */
- struct rte_memseg *found = rte_mem_virt2memseg(ms->addr, msl);
+ struct rte_memseg *found;
+
+ if (msl->external)
+ return 0;
+
+ found = rte_mem_virt2memseg(ms->addr, msl);
found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 1cf6aebf..d589c692 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -4,6 +4,8 @@
#include <string.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
#include <sys/socket.h>
#include <linux/netlink.h>
@@ -14,15 +16,32 @@
#include <rte_malloc.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
#include "eal_private.h"
static struct rte_intr_handle intr_handle = {.fd = -1 };
static bool monitor_started;
+static bool hotplug_handle;
#define EAL_UEV_MSG_LEN 4096
#define EAL_UEV_MSG_ELEM_LEN 128
+/*
+ * Spinlock for device hot-unplug failure handling. Any code that accesses the
+ * bus or a device (e.g. handling SIGBUS on the bus, or handling a memory
+ * failure for a device) must take this lock; it protects the bus and devices
+ * from race conditions.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
static void dev_uev_handler(__rte_unused void *param);
/* identify the system layer which reports this event. */
@@ -33,6 +52,55 @@ enum eal_dev_event_subsystem {
EAL_DEV_EVENT_SUBSYSTEM_MAX
};
+static void
+sigbus_action_recover(void)
+{
+ if (sigbus_need_recover) {
+ sigaction(SIGBUS, &sigbus_action_old, NULL);
+ sigbus_need_recover = 0;
+ }
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+ void *ctx __rte_unused)
+{
+ int ret;
+
+ RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+ (int)pthread_self(), info->si_addr);
+
+ rte_spinlock_lock(&failure_handle_lock);
+ ret = rte_bus_sigbus_handler(info->si_addr);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret == -1) {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle SIGBUS for hot-unplug, "
+ "(rte_errno: %s)!", strerror(rte_errno));
+ } else if (ret == 1) {
+ if ((sigbus_action_old.sa_flags & SA_SIGINFO)
+ && sigbus_action_old.sa_sigaction) {
+ (*(sigbus_action_old.sa_sigaction))(signum,
+ info, ctx);
+ } else if (!(sigbus_action_old.sa_flags & SA_SIGINFO)
+ && sigbus_action_old.sa_handler) {
+ (*(sigbus_action_old.sa_handler))(signum);
+ } else {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle generic SIGBUS!");
+ }
+ }
+
+ RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+ const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
static int
dev_uev_socket_fd_create(void)
{
@@ -147,6 +215,9 @@ dev_uev_handler(__rte_unused void *param)
struct rte_dev_event uevent;
int ret;
char buf[EAL_UEV_MSG_LEN];
+ struct rte_bus *bus;
+ struct rte_device *dev;
+ const char *busname = "";
memset(&uevent, 0, sizeof(struct rte_dev_event));
memset(buf, 0, EAL_UEV_MSG_LEN);
@@ -171,8 +242,43 @@ dev_uev_handler(__rte_unused void *param)
RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
uevent.devname, uevent.type, uevent.subsystem);
- if (uevent.devname)
- dev_callback_process(uevent.devname, uevent.type);
+ switch (uevent.subsystem) {
+ case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+ case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+ busname = "pci";
+ break;
+ default:
+ break;
+ }
+
+ if (uevent.devname) {
+ if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+ rte_spinlock_lock(&failure_handle_lock);
+ bus = rte_bus_find_by_name(busname);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+ busname);
+ rte_spinlock_unlock(&failure_handle_lock);
+ return;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name,
+ uevent.devname);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+ "bus (%s)\n", uevent.devname, busname);
+ rte_spinlock_unlock(&failure_handle_lock);
+ return;
+ }
+
+ ret = bus->hot_unplug_handler(dev);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
+ "for device (%s)\n", dev->name);
+ return;
+ }
+ }
+ rte_dev_event_callback_process(uevent.devname, uevent.type);
+ }
}
int __rte_experimental
@@ -220,5 +326,67 @@ rte_dev_event_monitor_stop(void)
close(intr_handle.fd);
intr_handle.fd = -1;
monitor_started = false;
+
return 0;
}
+
+int
+dev_sigbus_handler_register(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ rte_errno = 0;
+
+ if (sigbus_need_recover)
+ return 0;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = SA_SIGINFO;
+ action.sa_mask = mask;
+ action.sa_sigaction = sigbus_handler;
+ sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+ return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+ rte_errno = 0;
+
+ sigbus_action_recover();
+
+ return rte_errno;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_register();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to register sigbus handler for devices.\n");
+
+ hotplug_handle = true;
+
+ return ret;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_unregister();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to unregister sigbus handler for devices.\n");
+
+ hotplug_handle = false;
+
+ return ret;
+}
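Together with rte_dev_event_callback_register() from the generic dev API, an
application opts in to hot-unplug handling roughly as follows. A minimal sketch,
with the callback body illustrative:

	#include <rte_dev.h>
	#include <rte_common.h>
	#include <rte_log.h>

	static void
	dev_event_cb(const char *device_name, enum rte_dev_event_type type,
			void *cb_arg __rte_unused)
	{
		if (type == RTE_DEV_EVENT_REMOVE)
			RTE_LOG(INFO, EAL, "device %s has been removed\n",
					device_name);
	}

	static int
	enable_hotplug_handling(void)
	{
		/* start the uevent monitor thread */
		if (rte_dev_event_monitor_start() < 0)
			return -1;
		/* install the SIGBUS handler and enable hot-unplug handling */
		if (rte_dev_hotplug_handle_enable() < 0)
			return -1;
		/* NULL device name: receive events for all devices */
		return rte_dev_event_callback_register(NULL, dev_event_cb, NULL);
	}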
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 3a7d4b22..0eab1cf7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/file.h>
#include <dirent.h>
+#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 4076c6d6..39252a88 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -33,6 +33,7 @@
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
+#include <rte_vfio.h>
#include "eal_private.h"
#include "eal_vfio.h"
@@ -308,6 +309,66 @@ vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
return ret;
}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/* enable req notifier */
+static int
+vfio_enable_req(const struct rte_intr_handle *intr_handle)
+{
+ int len, ret;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* disable req notifier */
+static int
+vfio_disable_req(const struct rte_intr_handle *intr_handle)
+{
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
+ intr_handle->fd);
+
+ return ret;
+}
+#endif
#endif
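How a driver consumes the new req notifier lives outside this file, but the shape
is: create an eventfd, mark the interrupt handle as RTE_INTR_HANDLE_VFIO_REQ, and
register a callback that starts hot-unplug handling when the kernel asks for the
device back. A minimal sketch (the eventfd setup and callback are illustrative):

	#include <sys/eventfd.h>
	#include <rte_interrupts.h>
	#include <rte_common.h>
	#include <rte_log.h>

	static void
	req_notifier_cb(void *cb_arg __rte_unused)
	{
		/* kernel has requested the device back; begin hot-unplug */
		RTE_LOG(INFO, EAL, "device removal requested by kernel\n");
	}

	static int
	setup_req_notifier(struct rte_intr_handle *ih, int vfio_dev_fd)
	{
		ih->fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
		if (ih->fd < 0)
			return -1;
		ih->vfio_dev_fd = vfio_dev_fd;
		ih->type = RTE_INTR_HANDLE_VFIO_REQ;
		if (rte_intr_callback_register(ih, req_notifier_cb, NULL) < 0)
			return -1;
		return rte_intr_enable(ih); /* ends up in vfio_enable_req() */
	}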
static int
@@ -556,6 +617,12 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
if (vfio_enable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_enable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -606,6 +673,12 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
if (vfio_disable_intx(intr_handle))
return -1;
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_disable_req(intr_handle))
+ return -1;
+ break;
+#endif
#endif
/* not used at this moment */
case RTE_INTR_HANDLE_DEV_EVENT:
@@ -672,6 +745,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
case RTE_INTR_HANDLE_VFIO_LEGACY:
bytes_read = sizeof(buf.vfio_intr_count);
break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ bytes_read = 0;
+ call = true;
+ break;
+#endif
#endif
case RTE_INTR_HANDLE_VDEV:
case RTE_INTR_HANDLE_EXT:
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index aa95551a..48b9c736 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -34,6 +34,7 @@
#include <rte_log.h>
#include <rte_eal_memconfig.h>
#include <rte_eal.h>
+#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_spinlock.h>
@@ -52,30 +53,55 @@ const int anonymous_hugepages_supported =
#endif
/*
+ * we don't actually care if memfd itself is supported - we only need to check
+ * if memfd supports hugetlbfs, as that already implies memfd support.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+static int memfd_create_supported =
+#ifdef MFD_HUGETLB
+#define MEMFD_SUPPORTED
+ 1;
+#else
+ 0;
+#endif
+
+/*
* not all kernel version support fallocate on hugetlbfs, so fall back to
* ftruncate and disallow deallocation if fallocate is not supported.
*/
static int fallocate_supported = -1; /* unknown */
-/* for single-file segments, we need some kind of mechanism to keep track of
+/*
+ * we have two modes - single file segments, and file-per-page mode.
+ *
+ * for single-file segments, we need some kind of mechanism to keep track of
* which hugepages can be freed back to the system, and which cannot. we cannot
* use flock() because they don't allow locking parts of a file, and we cannot
* use fcntl() due to issues with their semantics, so we will have to rely on a
- * bunch of lockfiles for each page.
+ * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
+ * of per-page lockfiles. we will store the actual segment list fd in the
+ * 'memseg_list_fd' field.
+ *
+ * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
+ * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
*
* we cannot know how many pages a system will have in advance, but we do know
* that they come in lists, and we know lengths of these lists. so, simply store
* a malloc'd array of fd's indexed by list and segment index.
*
* they will be initialized at startup, and filled as we allocate/deallocate
- * segments. also, use this to track memseg list proper fd.
+ * segments.
*/
static struct {
int *fds; /**< dynamically allocated array of segment lock fd's */
int memseg_list_fd; /**< memseg list fd */
int len; /**< total length of the array */
int count; /**< entries used in an array */
-} lock_fds[RTE_MAX_MEMSEG_LISTS];
+} fd_list[RTE_MAX_MEMSEG_LISTS];
/** local copy of a memory map, used to synchronize memory hotplug in MP */
static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
@@ -182,6 +208,31 @@ get_file_size(int fd)
return st.st_size;
}
+static inline uint32_t
+bsf64(uint64_t v)
+{
+ return (uint32_t)__builtin_ctzll(v);
+}
+
+static inline uint32_t
+log2_u64(uint64_t v)
+{
+ if (v == 0)
+ return 0;
+ v = rte_align64pow2(v);
+ return bsf64(v);
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+ /* as per the mmap() manpage, huge page sizes are encoded as the log2 of
+ * the page size, shifted by MAP_HUGE_SHIFT
+ */
+ int log2 = log2_u64(page_sz);
+ return log2 << RTE_MAP_HUGE_SHIFT;
+}
+
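/* Worked example (editorial, not part of the patch): for 2M pages,
 * log2_u64(2 * 1024 * 1024) == 21, so pagesz_flags() returns
 * 21 << RTE_MAP_HUGE_SHIFT, i.e. 21 << 26, the same encoding as the
 * kernel's MAP_HUGE_2MB and MFD_HUGE_2MB constants.
 */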
/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
static int lock(int fd, int type)
{
@@ -209,12 +260,12 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
char path[PATH_MAX] = {0};
int fd;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* does this lock already exist? */
if (fd >= 0)
return fd;
@@ -236,8 +287,8 @@ static int get_segment_lock_fd(int list_idx, int seg_idx)
return -1;
}
/* store it for future reference */
- lock_fds[list_idx].fds[seg_idx] = fd;
- lock_fds[list_idx].count++;
+ fd_list[list_idx].fds[seg_idx] = fd;
+ fd_list[list_idx].count++;
return fd;
}
@@ -245,12 +296,12 @@ static int unlock_segment(int list_idx, int seg_idx)
{
int fd, ret;
- if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+ if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
return -1;
- if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+ if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
return -1;
- fd = lock_fds[list_idx].fds[seg_idx];
+ fd = fd_list[list_idx].fds[seg_idx];
/* upgrade lock to exclusive to see if we can remove the lockfile */
ret = lock(fd, LOCK_EX);
@@ -270,8 +321,8 @@ static int unlock_segment(int list_idx, int seg_idx)
* and remove it from list anyway.
*/
close(fd);
- lock_fds[list_idx].fds[seg_idx] = -1;
- lock_fds[list_idx].count--;
+ fd_list[list_idx].fds[seg_idx] = -1;
+ fd_list[list_idx].count--;
if (ret < 0)
return -1;
@@ -279,16 +330,68 @@ static int unlock_segment(int list_idx, int seg_idx)
}
static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+ unsigned int list_idx __rte_unused,
+ unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+ int fd;
+ char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].memseg_list_fd = fd;
+ }
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+
+ if (fd < 0) {
+ int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ snprintf(segname, sizeof(segname), "seg_%i-%i",
+ list_idx, seg_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+ }
+ }
+ return fd;
+#endif
+ return -1;
+}
+
+static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
int fd;
+ /* for in-memory mode, we only make it here when we're sure we support
+ * memfd, and this is a special case.
+ */
+ if (internal_config.in_memory)
+ return get_seg_memfd(hi, list_idx, seg_idx);
+
if (internal_config.single_file_segments) {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
- fd = lock_fds[list_idx].memseg_list_fd;
+ fd = fd_list[list_idx].memseg_list_fd;
if (fd < 0) {
fd = open(path, O_CREAT | O_RDWR, 0600);
@@ -304,24 +407,30 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
close(fd);
return -1;
}
- lock_fds[list_idx].memseg_list_fd = fd;
+ fd_list[list_idx].memseg_list_fd = fd;
}
} else {
/* create a hugepage file path */
eal_get_hugefile_path(path, buflen, hi->hugedir,
list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
- fd = open(path, O_CREAT | O_RDWR, 0600);
+
+ fd = fd_list[list_idx].fds[seg_idx];
+
if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- return -1;
- }
- /* take out a read lock */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
}
}
return fd;
@@ -332,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
uint64_t fa_offset, uint64_t page_sz, bool grow)
{
bool again = false;
+
+ /* in-memory mode is a special case, because we don't need to perform
+ * any locking, and we can be sure that fallocate() is supported.
+ */
+ if (internal_config.in_memory) {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* grow or shrink the file */
+ ret = fallocate(fd, flags, fa_offset, page_sz);
+
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ /* increase/decrease total segment count */
+ fd_list[list_idx].count += (grow ? 1 : -1);
+ if (!grow && fd_list[list_idx].count == 0) {
+ close(fd_list[list_idx].memseg_list_fd);
+ fd_list[list_idx].memseg_list_fd = -1;
+ }
+ return 0;
+ }
+
do {
if (fallocate_supported == 0) {
/* we cannot deallocate memory if fallocate() is not
@@ -410,9 +546,9 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* page file fd, so that one of the processes
* could then delete the file after shrinking.
*/
- if (ret < 1 && lock_fds[list_idx].count == 0) {
+ if (ret < 1 && fd_list[list_idx].count == 0) {
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
if (ret < 0) {
@@ -448,13 +584,13 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
* more segments active in this segment list,
* and remove the file if there aren't.
*/
- if (lock_fds[list_idx].count == 0) {
+ if (fd_list[list_idx].count == 0) {
if (unlink(path))
RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
__func__, path,
strerror(errno));
close(fd);
- lock_fds[list_idx].memseg_list_fd = -1;
+ fd_list[list_idx].memseg_list_fd = -1;
}
}
}
@@ -481,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
void *new_addr;
alloc_sz = hi->hugepage_sz;
- if (!internal_config.single_file_segments &&
- internal_config.in_memory &&
- anonymous_hugepages_supported) {
- int log2, flags;
-
- log2 = rte_log2_u32(alloc_sz);
- /* as per mmap() manpage, all page sizes are log2 of page size
- * shifted by MAP_HUGE_SHIFT
- */
- flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+
+ /* these are checked at init, but code analyzers don't know that */
+ if (internal_config.in_memory && !anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+ return -1;
+ }
+ if (internal_config.in_memory && !memfd_create_supported &&
+ internal_config.single_file_segments) {
+ RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+ return -1;
+ }
+
+ /* in-memory without memfd is a special case */
+ int mmap_flags;
+
+ if (internal_config.in_memory && !memfd_create_supported) {
+ int pagesz_flag, flags;
+
+ pagesz_flag = pagesz_flags(alloc_sz);
+ flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED |
MAP_PRIVATE | MAP_ANONYMOUS;
fd = -1;
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
-
- /* single-file segments codepath will never be active because
- * in-memory mode is incompatible with it and it's stopped at
- * EAL initialization stage, however the compiler doesn't know
- * that and complains about map_offset being used uninitialized
- * on failure codepaths while having in-memory mode enabled. so,
- * assign a value here.
+ mmap_flags = flags;
+
+ /* single-file segments codepath will never be active
+ * here because in-memory mode is incompatible with the
+ * fallback path, and it's stopped at EAL initialization
+ * stage.
*/
map_offset = 0;
} else {
@@ -524,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
__func__, strerror(errno));
goto resized;
}
- if (internal_config.hugepage_unlink) {
+ if (internal_config.hugepage_unlink &&
+ !internal_config.in_memory) {
if (unlink(path)) {
RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
__func__, strerror(errno));
@@ -532,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
}
}
}
-
- /*
- * map the segment, and populate page tables, the kernel fills
- * this segment with zeros if it's a new page.
- */
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
- map_offset);
+ mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
}
+ /*
+ * map the segment, and populate page tables, the kernel fills
+ * this segment with zeros if it's a new page.
+ */
+ va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+ map_offset);
+
if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
strerror(errno));
@@ -593,10 +738,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
goto mapped;
}
#endif
- /* for non-single file segments that aren't in-memory, we can close fd
- * here */
- if (!internal_config.single_file_segments && !internal_config.in_memory)
- close(fd);
ms->addr = addr;
ms->hugepage_sz = alloc_sz;
@@ -626,7 +767,10 @@ unmapped:
RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
}
resized:
- /* in-memory mode will never be single-file-segments mode */
+ /* some codepaths will return negative fd, so exit early */
+ if (fd < 0)
+ return -1;
+
if (internal_config.single_file_segments) {
resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
alloc_sz, false);
@@ -638,6 +782,7 @@ resized:
lock(fd, LOCK_EX) == 1)
unlink(path);
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
return -1;
}
@@ -648,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
{
uint64_t map_offset;
char path[PATH_MAX];
- int fd, ret;
+ int fd, ret = 0;
+ bool exit_early;
/* erase page data */
memset(ms->addr, 0, ms->len);
@@ -660,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
return -1;
}
+ exit_early = false;
+
+ /* if we're using anonymous hugepages, nothing to be done */
+ if (internal_config.in_memory && !memfd_create_supported)
+ exit_early = true;
+
/* if we've already unlinked the page, nothing needs to be done */
- if (internal_config.hugepage_unlink) {
+ if (!internal_config.in_memory && internal_config.hugepage_unlink)
+ exit_early = true;
+
+ if (exit_early) {
memset(ms, 0, sizeof(*ms));
return 0;
}
@@ -684,14 +839,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
/* if we're able to take out a write lock, we're the last one
* holding onto this page.
*/
- ret = lock(fd, LOCK_EX);
- if (ret >= 0) {
- /* no one else is using this page */
- if (ret == 1)
- unlink(path);
+ if (!internal_config.in_memory) {
+ ret = lock(fd, LOCK_EX);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ }
}
/* closing fd will drop the lock */
close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
}
memset(ms, 0, sizeof(*ms));
@@ -828,7 +986,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
int msl_idx, seg_idx, ret, dir_fd = -1;
start_addr = (uintptr_t) msl->base_va;
- end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+ end_addr = start_addr + msl->len;
if ((uintptr_t)wa->ms->addr < start_addr ||
(uintptr_t)wa->ms->addr >= end_addr)
@@ -1250,6 +1408,9 @@ sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
unsigned int i;
int msl_idx;
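+ /* external segments are app-managed and not shared with secondary
+ * processes, so there is nothing to synchronize here - skip them
+ */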
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1298,6 +1459,9 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
char name[PATH_MAX];
int msl_idx, ret;
+ if (msl->external)
+ return 0;
+
msl_idx = msl - mcfg->memsegs;
primary_msl = &mcfg->memsegs[msl_idx];
local_msl = &local_memsegs[msl_idx];
@@ -1314,50 +1478,176 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
return -1;
}
local_msl->base_va = primary_msl->base_va;
+ local_msl->len = primary_msl->len;
return 0;
}
static int
-secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
- void *arg __rte_unused)
+alloc_list(int list_idx, int len)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned int i, len;
- int msl_idx;
int *data;
+ int i;
- msl_idx = msl - mcfg->memsegs;
- len = msl->memseg_arr.len;
-
- /* ensure we have space to store lock fd per each possible segment */
+ /* ensure we have space to store an fd for each possible segment */
data = malloc(sizeof(int) * len);
if (data == NULL) {
- RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
+ RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
return -1;
}
/* set all fd's as invalid */
for (i = 0; i < len; i++)
data[i] = -1;
- lock_fds[msl_idx].fds = data;
- lock_fds[msl_idx].len = len;
- lock_fds[msl_idx].count = 0;
- lock_fds[msl_idx].memseg_list_fd = -1;
+ fd_list[list_idx].fds = data;
+ fd_list[list_idx].len = len;
+ fd_list[list_idx].count = 0;
+ fd_list[list_idx].memseg_list_fd = -1;
+
+ return 0;
+}
+
+static int
+fd_list_create_walk(const struct rte_memseg_list *msl,
+ void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int len;
+ int msl_idx;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ len = msl->memseg_arr.len;
+
+ return alloc_list(msl_idx, len);
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* if list is not allocated, allocate it */
+ if (fd_list[list_idx].len == 0) {
+ int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+ if (alloc_list(list_idx, len) < 0)
+ return -ENOMEM;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
return 0;
}
int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+ int fd;
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+ } else if (fd_list[list_idx].len == 0) {
+ /* list not initialized */
+ fd = -1;
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+ }
+ if (fd < 0)
+ return -ENODEV;
+ return fd;
+}
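+/* a rough caller-side sketch, via the public wrappers exported in
+ * rte_eal_version.map (assuming addr points into an EAL-managed segment):
+ *
+ *   const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ *   int fd = rte_memseg_get_fd(ms); // resolves to this lookup
+ *   size_t offset;
+ *   int ret = rte_memseg_get_fd_offset(ms, &offset);
+ */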
+
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+ unsigned int i;
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+ int pagesz_flag = pagesz_flags(pagesz);
+ int flags;
+
+ flags = pagesz_flag | MFD_HUGETLB;
+ int fd = memfd_create("test", flags);
+ if (fd < 0) {
+ /* we failed - let memalloc know this isn't working */
+ if (errno == EINVAL) {
+ memfd_create_supported = 0;
+ return 0; /* not supported */
+ }
+
+ /* we got other error - something's wrong */
+ return -1; /* error */
+ }
+ close(fd);
+ return 1; /* supported */
+ }
+#endif
+ return 0; /* not supported */
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* fd_list not initialized? */
+ if (fd_list[list_idx].len == 0)
+ return -ENODEV;
+ if (internal_config.single_file_segments) {
+ size_t pgsz = mcfg->memsegs[list_idx].page_sz;
+
+ /* segment not active? */
+ if (fd_list[list_idx].memseg_list_fd < 0)
+ return -ENOENT;
+ *offset = pgsz * seg_idx;
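+ /* e.g. with 2M pages, seg_idx 10 yields offset
+ * 10 * 2M = 20M into the per-list file
+ */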
+ } else {
+ /* segment not active? */
+ if (fd_list[list_idx].fds[seg_idx] < 0)
+ return -ENOENT;
+ *offset = 0;
+ }
+ return 0;
+}
+
+int
eal_memalloc_init(void)
{
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ internal_config.in_memory) {
+ int mfd_res = test_memfd_create();
- /* initialize all of the lock fd lists */
- if (internal_config.single_file_segments)
- if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL))
+ if (mfd_res < 0) {
+ RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+ return -1;
+ }
+ if (mfd_res == 1)
+ RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+ else
+ RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+ /* single-file segments mode is only supported together with
+ * in-memory mode if hugetlbfs is usable via memfd_create;
+ * the test above has established whether it is.
+ */
+ if (internal_config.single_file_segments &&
+ mfd_res != 1) {
+ RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
return -1;
+ }
+ /* this cannot ever happen but better safe than sorry */
+ if (!anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+ return -1;
+ }
+ }
+
+ /* initialize all of the fd lists */
+ if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+ return -1;
return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index dbf19499..fce86fda 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -5,6 +5,7 @@
#define _FILE_OFFSET_BITS 64
#include <errno.h>
+#include <fcntl.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -17,6 +18,7 @@
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/file.h>
+#include <sys/resource.h>
#include <unistd.h>
#include <limits.h>
#include <sys/ioctl.h>
@@ -263,7 +265,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
int node_id = -1;
int essential_prev = 0;
int oldpolicy;
- struct bitmask *oldmask = numa_allocate_nodemask();
+ struct bitmask *oldmask = NULL;
bool have_numa = true;
unsigned long maxnode = 0;
@@ -275,6 +277,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
if (have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ oldmask = numa_allocate_nodemask();
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
RTE_LOG(ERR, EAL,
@@ -402,7 +405,8 @@ out:
numa_set_localalloc();
}
}
- numa_free_cpumask(oldmask);
+ if (oldmask != NULL)
+ numa_free_cpumask(oldmask);
#endif
return i;
}
@@ -584,7 +588,7 @@ unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
for (page = 0; page < nrpages; page++) {
struct hugepage_file *hp = &hugepg_tbl[page];
- if (hp->final_va != NULL && unlink(hp->filepath)) {
+ if (hp->orig_va != NULL && unlink(hp->filepath)) {
RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
__func__, hp->filepath, strerror(errno));
}
@@ -771,7 +775,10 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
rte_fbarray_set_used(arr, ms_idx);
- close(fd);
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
(seg_len * page_sz) >> 20, socket_id);
@@ -857,6 +864,7 @@ alloc_va_space(struct rte_memseg_list *msl)
return -1;
}
msl->base_va = addr;
+ msl->len = mem_sz;
return 0;
}
@@ -1365,6 +1373,7 @@ eal_legacy_hugepage_init(void)
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
+ msl->len = internal_config.memory;
/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
@@ -1611,7 +1620,7 @@ eal_legacy_hugepage_init(void)
if (msl->memseg_arr.count > 0)
continue;
/* this is an unused list, deallocate it */
- mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+ mem_sz = msl->len;
munmap(msl->base_va, mem_sz);
msl->base_va = NULL;
@@ -1770,6 +1779,7 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned int num_hp = 0;
unsigned int i = 0;
@@ -1813,6 +1823,9 @@ eal_legacy_hugepage_attach(void)
struct hugepage_file *hf = &hp[i];
size_t map_sz = hf->size;
void *map_addr = hf->final_va;
+ int msl_idx, ms_idx;
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
/* if size is zero, no more pages left */
if (map_sz == 0)
@@ -1830,25 +1843,50 @@ eal_legacy_hugepage_attach(void)
if (map_addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
hf->filepath, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
/* set shared lock on the file. */
if (flock(fd, LOCK_SH) < 0) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
__func__, strerror(errno));
- close(fd);
- goto error;
+ goto fd_error;
}
- close(fd);
+ /* find segment data */
+ msl = rte_mem_virt2memseg_list(map_addr);
+ if (msl == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
+ __func__);
+ goto fd_error;
+ }
+ ms = rte_mem_virt2memseg(map_addr, msl);
+ if (ms == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
+ __func__);
+ goto fd_error;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
+ __func__);
+ goto fd_error;
+ }
+
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
close(fd_hugepage);
return 0;
+fd_error:
+ close(fd);
error:
/* map all segments into memory to make sure we get the addrs */
cur_seg = 0;
@@ -2093,18 +2131,65 @@ static int __rte_unused
memseg_primary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, socket_id, hpi_idx, msl_idx = 0;
+ struct memtype {
+ uint64_t page_sz;
+ int socket_id;
+ } *memtypes = NULL;
+ int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
struct rte_memseg_list *msl;
- uint64_t max_mem, total_mem;
+ uint64_t max_mem, max_mem_per_type;
+ unsigned int max_seglists_per_type;
+ unsigned int n_memtypes, cur_type;
/* no-huge does not need this at all */
if (internal_config.no_hugetlbfs)
return 0;
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- total_mem = 0;
+ /*
+ * figuring out the amount of memory we're going to have is a long and very
+ * involved process. the basic element we're operating with is a memory
+ * type, defined as a combination of NUMA node ID and page size (so that
+ * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
+ *
+ * deciding the amount of memory going towards each memory type is a
+ * balancing act between maximum segments per type, maximum memory per
+ * type, and number of detected NUMA nodes. the goal is to make sure
+ * each memory type gets at least one memseg list.
+ *
+ * the total amount of memory is limited by RTE_MAX_MEM_MB value.
+ *
+ * the total amount of memory per type is limited by either
+ * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+ * of detected NUMA nodes. additionally, maximum number of segments per
+ * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
+ * smaller page sizes, it can take hundreds of thousands of segments to
+ * reach the above specified per-type memory limits.
+ *
+ * additionally, each type may have multiple memseg lists associated
+ * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+ * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+ *
+ * the number of memseg lists per type is decided based on the above
+ * limits, and also taking number of detected NUMA nodes, to make sure
+ * that we don't run out of memseg lists before we populate all NUMA
+ * nodes with memory.
+ *
+ * we do this in three stages. first, we collect the number of types.
+ * then, we figure out memory constraints and populate the list of
+ * would-be memseg lists. then, we go ahead and allocate the memseg
+ * lists.
+ */
- /* create memseg lists */
+ /* create space for mem types */
+ n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+ memtypes = calloc(n_memtypes, sizeof(*memtypes));
+ if (memtypes == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+ return -1;
+ }
+
+ /* populate mem types */
+ cur_type = 0;
for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
hpi_idx++) {
struct hugepage_info *hpi;
@@ -2113,62 +2198,114 @@ memseg_primary_init(void)
hpi = &internal_config.hugepage_info[hpi_idx];
hugepage_sz = hpi->hugepage_sz;
- for (i = 0; i < (int) rte_socket_count(); i++) {
- uint64_t max_type_mem, total_type_mem = 0;
- int type_msl_idx, max_segs, total_segs = 0;
-
- socket_id = rte_socket_id_by_idx(i);
+ for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+ int socket_id = rte_socket_id_by_idx(i);
#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
if (socket_id > 0)
break;
#endif
+ memtypes[cur_type].page_sz = hugepage_sz;
+ memtypes[cur_type].socket_id = socket_id;
- if (total_mem >= max_mem)
- break;
-
- max_type_mem = RTE_MIN(max_mem - total_mem,
- (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
- max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ RTE_LOG(DEBUG, EAL, "Detected memory type: "
+ "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+ socket_id, hugepage_sz);
+ }
+ }
- type_msl_idx = 0;
- while (total_type_mem < max_type_mem &&
- total_segs < max_segs) {
- uint64_t cur_max_mem, cur_mem;
- unsigned int n_segs;
+ /* set up limits for types */
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+ max_mem / n_memtypes);
+ /*
+ * limit maximum number of segment lists per type to ensure there's
+ * space for memseg lists for all NUMA nodes with all page sizes
+ */
+ max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
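+ /* e.g. with 2 NUMA nodes and 2 page sizes (n_memtypes = 4), a 512G
+ * max_mem and a 128G per-type cap (illustrative values), each type
+ * gets min(128G, 512G / 4) = 128G, and the available memseg lists
+ * are split four ways between the types
+ */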
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
+ if (max_seglists_per_type == 0) {
+ RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
- msl = &mcfg->memsegs[msl_idx++];
+ /* go through all mem types and create segment lists */
+ msl_idx = 0;
+ for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+ unsigned int cur_seglist, n_seglists, n_segs;
+ unsigned int max_segs_per_type, max_segs_per_list;
+ struct memtype *type = &memtypes[cur_type];
+ uint64_t max_mem_per_list, pagesz;
+ int socket_id;
- cur_max_mem = max_type_mem - total_type_mem;
+ pagesz = type->page_sz;
+ socket_id = type->socket_id;
- cur_mem = get_mem_amount(hugepage_sz,
- cur_max_mem);
- n_segs = cur_mem / hugepage_sz;
+ /*
+ * we need to create segment lists for this type. we must take
+ * into account the following things:
+ *
+ * 1. total amount of memory we can use for this memory type
+ * 2. total amount of memory per memseg list allowed
+ * 3. number of segments needed to fit the amount of memory
+ * 4. number of segments allowed per type
+ * 5. number of segments allowed per memseg list
+ * 6. number of memseg lists we are allowed to take up
+ */
- if (alloc_memseg_list(msl, hugepage_sz, n_segs,
- socket_id, type_msl_idx))
- return -1;
+ /* calculate how many segments we will need in total */
+ max_segs_per_type = max_mem_per_type / pagesz;
+ /* limit number of segments to maximum allowed per type */
+ max_segs_per_type = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+ /* limit number of segments to maximum allowed per list */
+ max_segs_per_list = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+ /* calculate how much memory we can have per segment list */
+ max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+ (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+ /* calculate how many segments each segment list will have */
+ n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+ /* calculate how many segment lists we can have */
+ n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+ max_mem_per_type / max_mem_per_list);
+
+ /* limit number of segment lists according to our maximum */
+ n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
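+ /* e.g. for 2M pages with a 128G per-type cap, a 32768 per-type and
+ * an 8192 per-list segment cap, and a 32G per-list memory cap (all
+ * illustrative values): max_segs_per_type = min(128G / 2M, 32768) =
+ * 32768, max_segs_per_list = 8192, max_mem_per_list =
+ * min(8192 * 2M, 32G) = 16G, n_segs = 8192, and n_seglists =
+ * min(32768 / 8192, 128G / 16G) = 4
+ */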
+
+ RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+ "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+ n_seglists, n_segs, socket_id, pagesz);
+
+ /* create all segment lists */
+ for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
- total_segs += msl->memseg_arr.len;
- total_type_mem = total_segs * hugepage_sz;
- type_msl_idx++;
+ if (alloc_memseg_list(msl, pagesz, n_segs,
+ socket_id, cur_seglist))
+ goto out;
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
- return -1;
- }
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ goto out;
}
- total_mem += total_type_mem;
}
}
- return 0;
+ /* we're successful */
+ ret = 0;
+out:
+ free(memtypes);
+ return ret;
}
static int
@@ -2204,6 +2341,25 @@ memseg_secondary_init(void)
int
rte_eal_memseg_init(void)
{
+ /* increase open file limit to the maximum, as segment fds may now
+ * be kept open for the lifetime of the process
+ */
+ struct rlimit lim;
+
+ if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+ /* set limit to maximum */
+ lim.rlim_cur = lim.rlim_max;
+
+ if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
+ strerror(errno));
+ } else {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
+ PRIu64 "\n",
+ (uint64_t)lim.rlim_cur);
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
+ }
+
return rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index b496fc71..379773b6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -121,8 +121,8 @@ eal_thread_loop(__attribute__((unused)) void *arg)
ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
- lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "...");
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
/* read on our pipe to get commands */
while (1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
index 2766bd78..bc8f0519 100644
--- a/lib/librte_eal/linuxapp/eal/eal_timer.c
+++ b/lib/librte_eal/linuxapp/eal/eal_timer.c
@@ -87,7 +87,7 @@ static pthread_t msb_inc_thread_id;
* containing the MSB of the HPET (unfortunately, we need
* this because hpet is 32 bits by default under linux).
*/
-static void
+static void *
hpet_msb_inc(__attribute__((unused)) void *arg)
{
uint32_t t;
@@ -98,6 +98,7 @@ hpet_msb_inc(__attribute__((unused)) void *arg)
eal_hpet_msb ++;
sleep(10);
}
+ return NULL;
}
uint64_t
@@ -178,7 +179,7 @@ rte_eal_hpet_init(int make_default)
/* create a thread that will increment a global variable for
* msb (hpet is 32 bits by default under linux) */
ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
- (void *(*)(void *))hpet_msb_inc, NULL);
+ hpet_msb_inc, NULL);
if (ret != 0) {
RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
internal_config.no_hpet = 1;
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index c68dc38e..0516b159 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -345,46 +345,13 @@ get_vfio_cfg_by_group_num(int iommu_group_num)
return NULL;
}
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
- int i;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
- }
-
- return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+ int iommu_group_num)
{
int i;
int vfio_group_fd;
struct vfio_group *cur_grp;
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -423,6 +390,47 @@ rte_vfio_get_group_fd(int iommu_group_num)
return vfio_group_fd;
}
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return vfio_cfg;
+ }
+
+ return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+ int i;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == container_fd)
+ return &vfio_cfgs[i];
+ }
+
+ return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
static int
get_vfio_group_idx(int vfio_group_fd)
{
@@ -509,7 +517,7 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
msl = rte_mem_virt2memseg_list(addr);
/* for IOVA as VA mode, no need to care for IOVA addresses */
- if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
@@ -523,13 +531,19 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
/* memsegs are contiguous in memory */
ms = rte_mem_virt2memseg(addr, msl);
while (cur_len < len) {
+ /* some memory segments may have invalid IOVA */
+ if (ms->iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+ ms->addr);
+ goto next;
+ }
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 1);
else
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 0);
-
+next:
cur_len += ms->len;
++ms;
}
@@ -896,7 +910,15 @@ rte_vfio_enable(const char *modname)
return 0;
}
- default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* open a new container */
+ default_vfio_cfg->vfio_container_fd =
+ rte_vfio_get_container_fd();
+ } else {
+ /* get the default container from the primary process */
+ default_vfio_cfg->vfio_container_fd =
+ vfio_get_default_container_fd();
+ }
/* check if we have VFIO driver enabled */
if (default_vfio_cfg->vfio_container_fd != -1) {
@@ -916,6 +938,45 @@ rte_vfio_is_enabled(const char *modname)
return default_vfio_cfg->vfio_enabled && mod_available;
}
+int
+vfio_get_default_container_fd(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ if (default_vfio_cfg->vfio_enabled)
+ return default_vfio_cfg->vfio_container_fd;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* if we were a secondary process we would try requesting
+ * the container fd from the primary, but we're the primary
+ * process, so just exit here
+ */
+ return -1;
+ }
+
+ p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
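+ /* the request below is served by the SOCKET_REQ_DEFAULT_CONTAINER
+ * case in vfio_mp_primary() (eal_vfio_mp_sync.c)
+ */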
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
+ }
+ free(mp_reply.msgs);
+ }
+
+ RTE_LOG(ERR, EAL, " cannot request default container fd\n");
+ return -1;
+}
+
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
@@ -1028,8 +1089,9 @@ rte_vfio_get_container_fd(void)
mp_rep = &mp_reply.msgs[0];
p = (struct vfio_mp_param *)mp_rep->param;
if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_container_fd = mp_rep->fds[0];
free(mp_reply.msgs);
- return mp_rep->fds[0];
+ return vfio_container_fd;
}
free(mp_reply.msgs);
}
@@ -1082,11 +1144,14 @@ rte_vfio_get_group_num(const char *sysfs_base,
}
static int
-type1_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
int *vfio_container_fd = arg;
+ if (msl->external)
+ return 0;
+
return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
@@ -1145,8 +1210,22 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_type1_dma_unmap dma_unmap;
int ret;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
if (do_map != 0) {
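+ /* on sPAPR, memory must be registered with the IOMMU
+ * before it can be mapped for DMA, so register the
+ * vaddr range first
+ */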
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
memset(&dma_map, 0, sizeof(dma_map));
dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
dma_map.vaddr = vaddr;
@@ -1163,13 +1242,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
} else {
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
- };
- reg.vaddr = (uintptr_t) vaddr;
- reg.size = len;
-
ret = ioctl(vfio_container_fd,
VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
if (ret) {
@@ -1196,12 +1268,15 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
- return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ if (msl->external)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
@@ -1210,12 +1285,15 @@ struct spapr_walk_param {
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
+ if (msl->external)
+ return 0;
+
if (max > param->window_size) {
param->hugepage_sz = ms->hugepage_sz;
param->window_size = max;
@@ -1670,9 +1748,6 @@ int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp;
- int vfio_group_fd;
- int i;
vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
if (vfio_cfg == NULL) {
@@ -1680,36 +1755,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
return -1;
}
- /* Check room for new group */
- if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- return -1;
- }
-
- /* Get an index for the new group */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
- return -1;
- }
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num);
- if (vfio_group_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
int
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 68d4750a..63ae115c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -115,6 +115,9 @@ struct vfio_iommu_type {
vfio_dma_func_t dma_map_func;
};
+/* get the vfio container that devices are bound to by default */
+int vfio_get_default_container_fd(void);
+
/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd);
@@ -129,6 +132,7 @@ int vfio_mp_sync_setup(void);
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 680a24aa..a1e8c834 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -66,6 +66,17 @@ vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
reply.fds[0] = fd;
}
break;
+ case SOCKET_REQ_DEFAULT_CONTAINER:
+ r->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ fd = vfio_get_default_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
default:
RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index cfa9448b..5afa0871 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -8,6 +8,7 @@
#ifdef __KERNEL__
#include <linux/if.h>
+#include <asm/barrier.h>
#define RTE_STD_C11
#else
#include <rte_common.h>
@@ -54,8 +55,13 @@ struct rte_kni_request {
* Writing should never overwrite the read position
*/
struct rte_kni_fifo {
+#ifdef RTE_USE_C11_MEM_MODEL
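+ /* on the userspace side these indices are accessed with __atomic
+ * builtins under the C11 memory model, so volatile is not needed
+ */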
+ unsigned write; /**< Next position to be written*/
+ unsigned read; /**< Next position to be read */
+#else
volatile unsigned write; /**< Next position to be written*/
volatile unsigned read; /**< Next position to be read */
+#endif
unsigned len; /**< Circular buffer length */
unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
void *volatile buffer[]; /**< The buffer contains mbuf pointers */
diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build
index e1fde15d..a18f3a82 100644
--- a/lib/librte_eal/meson.build
+++ b/lib/librte_eal/meson.build
@@ -21,11 +21,10 @@ else
error('unsupported system type "@0@"'.format(host_machine.system()))
endif
-version = 8 # the version of the EAL API
+version = 9 # the version of the EAL API
allow_experimental_apis = true
deps += 'compat'
deps += 'kvargs'
-cflags += '-D_GNU_SOURCE'
sources = common_sources + env_sources
objs = common_objs + env_objs
headers = common_headers + env_headers
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 344a43d3..04f62424 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -19,9 +19,6 @@ DPDK_2.0 {
rte_dump_tailq;
rte_eal_alarm_cancel;
rte_eal_alarm_set;
- rte_eal_devargs_add;
- rte_eal_devargs_dump;
- rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
rte_eal_get_physmem_size;
@@ -32,7 +29,6 @@ DPDK_2.0 {
rte_eal_lcore_role;
rte_eal_mp_remote_launch;
rte_eal_mp_wait_lcore;
- rte_eal_parse_devargs_str;
rte_eal_process_type;
rte_eal_remote_launch;
rte_eal_tailq_lookup;
@@ -134,8 +130,6 @@ DPDK_16.11 {
rte_delay_us_block;
rte_delay_us_callback_register;
- rte_eal_dev_attach;
- rte_eal_dev_detach;
} DPDK_16.07;
@@ -262,6 +256,16 @@ DPDK_18.08 {
} DPDK_18.05;
+DPDK_18.11 {
+ global:
+
+ rte_eal_get_runtime_dir;
+ rte_eal_hotplug_add;
+ rte_eal_hotplug_remove;
+ rte_strscpy;
+
+} DPDK_18.08;
+
EXPERIMENTAL {
global:
@@ -270,12 +274,19 @@ EXPERIMENTAL {
rte_class_register;
rte_class_unregister;
rte_ctrl_thread_create;
+ rte_delay_us_sleep;
+ rte_dev_event_callback_process;
rte_dev_event_callback_register;
rte_dev_event_callback_unregister;
rte_dev_event_monitor_start;
rte_dev_event_monitor_stop;
+ rte_dev_hotplug_handle_disable;
+ rte_dev_hotplug_handle_enable;
+ rte_dev_is_probed;
rte_dev_iterator_init;
rte_dev_iterator_next;
+ rte_dev_probe;
+ rte_dev_remove;
rte_devargs_add;
rte_devargs_dump;
rte_devargs_insert;
@@ -284,9 +295,8 @@ EXPERIMENTAL {
rte_devargs_parsef;
rte_devargs_remove;
rte_devargs_type_count;
+ rte_eal_check_dma_mask;
rte_eal_cleanup;
- rte_eal_hotplug_add;
- rte_eal_hotplug_remove;
rte_fbarray_attach;
rte_fbarray_destroy;
rte_fbarray_detach;
@@ -311,6 +321,14 @@ EXPERIMENTAL {
rte_fbarray_set_used;
rte_log_register_type_and_pick_level;
rte_malloc_dump_heaps;
+ rte_malloc_heap_create;
+ rte_malloc_heap_destroy;
+ rte_malloc_heap_get_socket;
+ rte_malloc_heap_memory_add;
+ rte_malloc_heap_memory_attach;
+ rte_malloc_heap_memory_detach;
+ rte_malloc_heap_memory_remove;
+ rte_malloc_heap_socket_is_external;
rte_mem_alloc_validator_register;
rte_mem_alloc_validator_unregister;
rte_mem_event_callback_register;
@@ -320,6 +338,10 @@ EXPERIMENTAL {
rte_mem_virt2memseg_list;
rte_memseg_contig_walk;
rte_memseg_contig_walk_thread_unsafe;
+ rte_memseg_get_fd;
+ rte_memseg_get_fd_offset;
+ rte_memseg_get_fd_thread_unsafe;
+ rte_memseg_get_fd_offset_thread_unsafe;
rte_memseg_list_walk;
rte_memseg_list_walk_thread_unsafe;
rte_memseg_walk;
@@ -330,6 +352,7 @@ EXPERIMENTAL {
rte_mp_request_sync;
rte_mp_request_async;
rte_mp_sendmsg;
+ rte_option_register;
rte_service_lcore_attr_get;
rte_service_lcore_attr_reset_all;
rte_service_may_be_active;